CUTLASS 3.8 Release (#2059)

* CUTLASS 3.8 Release

* update

* Update README.md

* Revert "Update README.md"

This reverts commit b353e36fe8.

* update

* update

---------

Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
mihir-awatramani
2025-01-24 23:44:06 -08:00
committed by GitHub
parent 9eb01fa0b0
commit 389e493055
290 changed files with 91223 additions and 292 deletions

View File

@@ -118,6 +118,7 @@ void FilterArchitecture() {
{ "SM80*", 80, kMaxDevice},
{ "SM89*", 89, 89},
{ "SM90*", 90, 90},
{ "SM100*", 100, 100},
{ 0, 0, false }
};
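
For context, a minimal sketch of how such a wildcard table can gate tests by device compute capability. The ArchEntry struct, the test_enabled helper, and the exact matching semantics are illustrative assumptions, not the testbed's actual implementation.

#include <cassert>
#include <cstring>
#include <string>

// Hypothetical mirror of the filter table: a test-name prefix pattern and an
// inclusive [min, max] compute-capability range on which the test may run.
struct ArchEntry {
  const char *pattern;  // e.g. "SM100*"
  int min_cc;           // lowest compute capability that runs the test
  int max_cc;           // highest compute capability that runs the test
};

// True if a test whose name starts with the pattern prefix (sans '*') is
// allowed on a device with the given compute capability.
inline bool test_enabled(const ArchEntry *table, const std::string &test_name, int device_cc) {
  for (const ArchEntry *e = table; e->pattern != nullptr; ++e) {
    std::size_t prefix_len = std::strlen(e->pattern) - 1;  // drop trailing '*'
    if (test_name.compare(0, prefix_len, e->pattern, prefix_len) == 0) {
      return device_cc >= e->min_cc && device_cc <= e->max_cc;
    }
  }
  return true;  // names matching no pattern are not filtered
}

int main() {
  constexpr int kMaxDevice = 9999;  // assumed "no upper bound" sentinel
  ArchEntry table[] = {
    {"SM80*", 80, kMaxDevice},
    {"SM89*", 89, 89},
    {"SM90*", 90, 90},
    {"SM100*", 100, 100},
    {nullptr, 0, 0},
  };
  assert(test_enabled(table, "SM100Only_Device_Gemm", 100));  // new SM100 row
  assert(!test_enabled(table, "SM90_Device_Gemm", 100));      // SM90-only test filtered out
  return 0;
}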

View File

@@ -679,6 +679,11 @@ struct GetName<cutlass::float_e4m3_t> {
static constexpr char name[] = "float_e4m3_t";
};
template <>
struct GetName<cutlass::float_e5m2_t> {
static constexpr char name[] = "float_e5m2_t";
};
template <>
struct GetName<cutlass::half_t> {
static constexpr char name[] = "half_t";
@@ -724,13 +729,20 @@ using VectorConvertTypes = ::testing::Types<
ResultSourcePair<cutlass::bfloat16_t, uint8_t>,
ResultSourcePair<cutlass::bfloat16_t, int8_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::float_e5m2_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::half_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::int2b_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::float_e5m2_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::half_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::uint2b_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::float_e5m2_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::half_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::int4b_t>,
ResultSourcePair<cutlass::float_e4m3_t, cutlass::uint4b_t>,
ResultSourcePair<cutlass::half_t, cutlass::uint4b_t>,
ResultSourcePair<cutlass::bfloat16_t, cutlass::uint4b_t>,
ResultSourcePair<float, cutlass::int4b_t>
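
For reference, a minimal sketch of how the GetName trait and the ResultSourcePair carriers fit together to label the typed conversion tests. The stand-in types and the print_pair helper are hypothetical, shown only to illustrate the pattern.

#include <cstdio>

// Stand-ins for the CUTLASS numeric types (assumptions for illustration).
struct float_e4m3_t {};
struct float_e5m2_t {};
struct int2b_t {};

// Each specialization carries a printable name, as in the hunk above.
template <typename T> struct GetName { static constexpr char name[] = "unknown"; };
template <> struct GetName<float_e4m3_t> { static constexpr char name[] = "float_e4m3_t"; };
template <> struct GetName<float_e5m2_t> { static constexpr char name[] = "float_e5m2_t"; };
template <> struct GetName<int2b_t>      { static constexpr char name[] = "int2b_t"; };

// A ResultSourcePair-style carrier as used by the typed test list.
template <typename Result, typename Source> struct ResultSourcePair {
  using ResultType = Result;
  using SourceType = Source;
};

template <typename Pair> void print_pair() {
  std::printf("convert %s -> %s\n",
              GetName<typename Pair::SourceType>::name,
              GetName<typename Pair::ResultType>::name);
}

int main() {
  print_pair<ResultSourcePair<float_e5m2_t, int2b_t>>();  // prints: convert int2b_t -> float_e5m2_t
  return 0;
}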

View File

@@ -29,6 +29,10 @@
add_custom_target(cutlass_test_unit_gemm_device)
add_custom_target(test_unit_gemm_device)
add_subdirectory(sm100_blockscaled_tensorop_gemm)
################################################################################
function(cutlass_test_unit_gemm_device_add_deps NAME)
@@ -433,12 +437,12 @@ cutlass_test_unit_gemm_device_add_executable(
gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
sm80_gemm_f64_f64_f64_tensor_op_f64.cu
# SM90 device level tests
gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu
gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu
sm80_gemm_f64_f64_f64_tensor_op_f64.cu
gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu
gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu
gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu
@@ -821,3 +825,147 @@ if (CUTLASS_NVCC_DEVICE_COMPILE)
endif()
if(CUTLASS_NVCC_ARCHS MATCHES "100")
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_sm100_fp16_gemm
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_f16_f16_f32_tensor_op_f32.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_sm100_stream_k
sm100_gemm_f16_f16_f16_tensor_op_f32_stream_k.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_sm100_bf16_gemm
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_bf16_bf16_f32_tensor_op_f32.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_stride_batch_alpha_beta_sm100
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_f8_f8_f8_tensor_op_s32_batch_alpha_beta.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_tensorop_runtime_datatype_sm100
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_f8_f8_f8_tensor_op_f32_runtime_datatype.cu
sm100_gemm_f6_f6_f32_tensor_op_f32_runtime_datatype.cu
sm100_gemm_f4_f4_f32_tensor_op_f32_runtime_datatype.cu
sm100_gemm_f8_f4_f32_tensor_op_f32_runtime_datatype.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_tensorop_sm100_ptr_array
# 14 (9 + 5) unit tests
sm100_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu
sm100_gemm_bf16_bf16_bf16_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_tensorop_sm100_group_gemm
sm100_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_16b_mixed_tensorop_sm100_ptr_array
# 14 (9 + 5) unit tests
sm100_gemm_f16_f16_f32_tensor_op_f32_ptr_array.cu
sm100_gemm_f16_f16_f16_tensor_op_f16_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_32b_tensorop_sm100_ptr_array
# 10 unit tests
sm100_gemm_f32_f32_f32_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_32b_tensorop_sm100_group_gemm
# 10 unit tests
sm100_gemm_f32_f32_f32_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_8b_tensorop_sm100_ptr_array
# 12 unit tests
sm100_gemm_i8_i8_i8_tensor_op_s32_ptr_array.cu
sm100_gemm_f8_f8_f8_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_8b_tensorop_sm100_group_gemm
# 8 unit tests
sm100_gemm_f8_f8_f8_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_mxf8_training_sm100_group_gemm
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_gemm_mxf8_mxf8_mxf8_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_gemm_device_mxf4xmxf8_sm100_group_gemm
# 8 unit tests
sm100_gemm_mxf4_mxf8_mxf8_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_blockscaled_gemm_device_fp4_tensorop_sm100_ptr_array
# 8 unit tests
sm100_gemm_f4_f4_f32_tensor_op_f32_ptr_array.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_blockscaled_gemm_device_fp4_tensorop_sm100_group_gemm_1
# 8 unit tests
sm100_gemm_f4_f4_f32_tensor_op_f32_group_gemm.cu
)
cutlass_test_unit_gemm_device_add_executable(
cutlass_test_unit_blockscaled_gemm_device_fp6_tensorop_sm100_ptr_array
# 8 unit tests
sm100_gemm_f6_f6_f32_tensor_op_f32_ptr_array.cu
)
endif()

File diff suppressed because it is too large

View File

@@ -111,6 +111,18 @@ struct ElementScalarType<Gemm, Default, std::void_t<typename Gemm::EpilogueOutpu
using Type = typename Gemm::EpilogueOutputOp::ElementScalar;
};
template <typename Gemm, typename = void>
struct IsF8F6F4Kernel {
static constexpr bool value = false;
};
template <typename Gemm>
struct IsF8F6F4Kernel<Gemm, std::void_t<decltype(Gemm::GemmKernel::CollectiveMainloop::IsF8F6F4)>> {
static constexpr bool value = true;
};
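
The IsF8F6F4Kernel trait above uses the standard std::void_t detection idiom: the partial specialization participates only when the nested IsF8F6F4 member is well-formed. A self-contained sketch of the same idiom, with hypothetical stand-in types:

#include <type_traits>

// Primary template: assume the marker is absent.
template <typename Kernel, typename = void>
struct HasF8F6F4 : std::false_type {};

// Chosen by partial ordering only when decltype(Kernel::IsF8F6F4) is well-formed.
template <typename Kernel>
struct HasF8F6F4<Kernel, std::void_t<decltype(Kernel::IsF8F6F4)>> : std::true_type {};

// Hypothetical kernels standing in for Gemm::GemmKernel::CollectiveMainloop.
struct PlainMainloop {};
struct F8F6F4Mainloop { static constexpr bool IsF8F6F4 = true; };

static_assert(!HasF8F6F4<PlainMainloop>::value, "no marker, primary template wins");
static_assert(HasF8F6F4<F8F6F4Mainloop>::value, "marker present, specialization wins");

int main() { return 0; }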
// The maximum swizzle size to use
//
// This class, like Splits above makes it harder to confuse
@@ -212,9 +224,26 @@ bool initialize_tensor(
scope_max = 2;
scope_min = 0;
}
else if (bits_input <= 6) {
scope_max = 2;
scope_min = -2;
}
else if (bits_input <= 8) {
if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
scope_max = 4;
scope_min = 1;
}
else {
scope_max = 1;
scope_min = -1;
}
}
else {
scope_max = 4;
@@ -487,6 +516,277 @@ struct HostCollectiveMainloop {
}
};
//
// Block Scaled Gemm Input Operands : A , B, scalefactorA, scalefactorB
//
template<
class Gemm,
int SchedulerPipelineStageCount_,
int AccumulatorPipelineStageCount_,
class ElementA_,
class ElementB_
>
struct HostCollectiveMainloop<cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockScaledSm100<
SchedulerPipelineStageCount_,
AccumulatorPipelineStageCount_>,
Gemm, ElementA_, ElementB_> {
// Kernel data types
using ElementA = ElementA_;
using StrideA = typename Gemm::GemmKernel::StrideA;
using InternalStrideA = typename Gemm::GemmKernel::InternalStrideA;
using ElementB = ElementB_;
using StrideB = typename Gemm::GemmKernel::StrideB;
using InternalStrideB = typename Gemm::GemmKernel::InternalStrideB;
using ScheduleType = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy::Schedule;
using LayoutTagA = cutlass::detail::StrideToLayoutTagA_t<StrideA>;
using LayoutTagB = cutlass::detail::StrideToLayoutTagB_t<StrideB>;
static constexpr bool IsGroupGemm = !cute::is_same_v<StrideA, InternalStrideA>;
using ElementAccumulator = typename Gemm::GemmKernel::ElementAccumulator;
using ElementScalingFactor = ElementAccumulator;
using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
using EpilogueOutputOp = typename Gemm::EpilogueOutputOp;
static constexpr int SFVecSize = Gemm::GemmKernel::CollectiveMainloop::SFVecSize;
using ElementSF = typename Gemm::GemmKernel::ElementSF;
using Sm100BlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
using Blk_MN = typename Sm100BlkScaledConfig::Blk_MN;
using Blk_SF = typename Sm100BlkScaledConfig::Blk_SF;
using SfAtom = typename Sm100BlkScaledConfig::SfAtom;
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
using InternalLayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
using InternalLayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
using Arguments = typename Gemm::GemmKernel::MainloopArguments;
// Whether to use relative equality checks
CheckEquality check_relative_equality = CheckEquality::EXACT;
std::vector<InternalStrideA> stride_a_host;
std::vector<InternalStrideB> stride_b_host;
cutlass::DeviceAllocation<InternalStrideA> stride_a_device;
cutlass::DeviceAllocation<InternalStrideB> stride_b_device;
std::vector<InternalLayoutSFA> layout_sfa_host;
std::vector<InternalLayoutSFB> layout_sfb_host;
cutlass::DeviceAllocation<InternalLayoutSFA> layout_sfa_device;
cutlass::DeviceAllocation<InternalLayoutSFB> layout_sfb_device;
typename LayoutTagA::Stride stride_factor_A;
typename LayoutTagB::Stride stride_factor_B;
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
std::vector<cutlass::HostTensor<ElementA, LayoutTagA>> tensors_A;
std::vector<cutlass::HostTensor<ElementB, LayoutTagB>> tensors_B;
std::vector<cutlass::HostTensor<ElementSF, LayoutTagA>> tensors_SFA;
std::vector<cutlass::HostTensor<ElementSF, LayoutTagB>> tensors_SFB;
cutlass::DeviceAllocation<const ElementA *> device_tensors_A;
cutlass::DeviceAllocation<const ElementB *> device_tensors_B;
cutlass::DeviceAllocation<const ElementSF *> device_tensors_SFA;
cutlass::DeviceAllocation<const ElementSF *> device_tensors_SFB;
uint64_t seed;
static constexpr uint64_t kDefaultSeed = 4096;
// Note: this limitation comes from the testbed, not the library
static_assert(is_row_or_col_major<InternalStrideA>(),
"ERROR: A layout is neither row nor column major");
static_assert(is_row_or_col_major<InternalStrideB>(),
"ERROR: B layout is neither row nor column major");
HostCollectiveMainloop(
CheckEquality check_relative_equality_ = CheckEquality::EXACT,
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
uint64_t seed_ = kDefaultSeed,
typename LayoutTagA::Stride stride_factor_A_ = typename LayoutTagA::Stride(),
typename LayoutTagB::Stride stride_factor_B_ = typename LayoutTagB::Stride()
):
check_relative_equality(check_relative_equality_),
stride_factor_A(stride_factor_A_),
stride_factor_B(stride_factor_B_),
init_A(init_A_), init_B(init_B_), seed(seed_) { }
template<class ProblemShapeType>
bool initialize(ProblemShapeType problem_shapes) {
//
// Allocate the GEMM workspace
//
tensors_A.clear();
tensors_B.clear();
stride_a_host.clear();
stride_b_host.clear();
tensors_SFA.clear();
tensors_SFB.clear();
layout_sfa_host.clear();
layout_sfb_host.clear();
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
L = std::max(problem_shapes.groups(), L);
for (int32_t i = 0; i < L; ++i) {
auto [M, N, K, mock_L] = cute::append<4>(problem_shapes.get_host_problem_shape(i), 1);
stride_a_host.push_back(cutlass::make_cute_packed_stride(InternalStrideA{}, {M, K, 1}));
stride_b_host.push_back(cutlass::make_cute_packed_stride(InternalStrideB{}, {N, K, 1}));
// 2.x host tensor does not natively contain a batch stride or coord, so we spoof it by folding it into the outer mode
auto a_coord = cutlass::make_Coord(M, K);
// CUTLASS row/column major refers to the MxK times KxN matrix product,
// so the B host tensor should be treated as KxN in the coord's view
auto b_coord = cutlass::make_Coord(K, N);
tensors_A.push_back(cutlass::HostTensor<ElementA, LayoutTagA>(a_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagA>::layout_factory(a_coord, stride_factor_A)));
tensors_B.push_back(cutlass::HostTensor<ElementB, LayoutTagB>(b_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagB>::layout_factory(b_coord, stride_factor_B)));
EXPECT_TRUE(initialize_tensor(tensors_A[i].host_view(), init_A, seed + 2022 + i));
EXPECT_TRUE(initialize_tensor(tensors_B[i].host_view(), init_B, seed + 2021 + i));
// It is possible to randomly initialize to all zeros, so override this with non-zeros
// in the upper left corner of each operand.
tensors_A[i].host_view().at({0, 0}) = ElementA(1);
tensors_B[i].host_view().at({0, 0}) = ElementB(1);
tensors_A[i].sync_device();
tensors_B[i].sync_device();
using namespace cute;
auto k_blks = cutlass::ceil_div(K, size<1>(shape(SfAtom{})));
auto m_blks = cutlass::ceil_div(M, Blk_MN{});
auto n_blks = cutlass::ceil_div(N, Blk_MN{});
layout_sfa_host.push_back(Sm100BlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(M, N, K, 1)));
layout_sfb_host.push_back(Sm100BlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(M, N, K, 1)));
// 2.x host tensor does not natively contain a batch stride or coord, so we spoof it by folding it into the outer mode
auto sfa_coord = cutlass::make_Coord(m_blks * Blk_MN{}, k_blks * Blk_SF{});
auto sfb_coord = cutlass::make_Coord(n_blks * Blk_MN{}, k_blks * Blk_SF{});
tensors_SFA.push_back(cutlass::HostTensor<ElementSF, LayoutTagA>(sfa_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagA>::layout_factory(sfa_coord, stride_factor_A)));
tensors_SFB.push_back(cutlass::HostTensor<ElementSF, LayoutTagB>(sfb_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagB>::layout_factory(sfb_coord, stride_factor_B)));
EXPECT_TRUE(initialize_tensor(tensors_SFA[i].host_view(), init_A, seed + 2024 + i));
EXPECT_TRUE(initialize_tensor(tensors_SFB[i].host_view(), init_B, seed + 2025 + i));
// It is possible to randomly initialize to all zeros, so override this with non-zeros
// in the upper left corner of each operand.
tensors_SFA[i].host_view().at({0, 0}) = ElementSF(1);
tensors_SFB[i].host_view().at({0, 0}) = ElementSF(1);
tensors_SFA[i].sync_device();
tensors_SFB[i].sync_device();
}
return true;
}
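
A worked sketch of the scale-factor extent arithmetic in initialize() above. The constants Blk_MN = 128, Blk_SF = 4, SFVecSize = 32, and an SfAtom K extent of SFVecSize * Blk_SF are illustrative assumptions; the real values come from Sm100BlkScaledConfig.

#include <cstdio>

// Same rounding the testbed uses via cutlass::ceil_div.
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Assumed config values (see lead-in); not taken from the library.
  constexpr int Blk_MN = 128;                  // rows covered by one SF atom
  constexpr int Blk_SF = 4;                    // SF columns per atom
  constexpr int SFVecSize = 32;                // elements sharing one scale factor
  constexpr int SfAtomK = SFVecSize * Blk_SF;  // assumed K extent of SfAtom

  int M = 300, K = 1024;
  int m_blks = ceil_div(M, Blk_MN);   // 3 -> M padded up to 384 rows of SFA
  int k_blks = ceil_div(K, SfAtomK);  // 8
  // SFA host tensor extents, padded to whole atoms as in initialize():
  std::printf("SFA extents: %d x %d\n", m_blks * Blk_MN, k_blks * Blk_SF);  // 384 x 32
  return 0;
}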
Arguments to_args(ProblemShapeType problem_shapes) {
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
L = std::max(problem_shapes.groups(), L);
std::vector<ElementA *> ptr_A_host(L);
std::vector<ElementB *> ptr_B_host(L);
std::vector<ElementSF *> ptr_SFA_host(L);
std::vector<ElementSF *> ptr_SFB_host(L);
for (int32_t i = 0; i < L; ++i) {
ptr_A_host.at(i) = tensors_A[i].device_data();
ptr_B_host.at(i) = tensors_B[i].device_data();
ptr_SFA_host.at(i) = tensors_SFA[i].device_data();
ptr_SFB_host.at(i) = tensors_SFB[i].device_data();
}
device_tensors_A.reset(L);
device_tensors_A.copy_from_host(ptr_A_host.data());
device_tensors_B.reset(L);
device_tensors_B.copy_from_host(ptr_B_host.data());
device_tensors_SFA.reset(L);
device_tensors_SFA.copy_from_host(ptr_SFA_host.data());
device_tensors_SFB.reset(L);
device_tensors_SFB.copy_from_host(ptr_SFB_host.data());
stride_a_device.reset(problem_shapes.groups());
stride_a_device.copy_from_host(stride_a_host.data());
stride_b_device.reset(problem_shapes.groups());
stride_b_device.copy_from_host(stride_b_host.data());
layout_sfa_device.reset(problem_shapes.groups());
layout_sfa_device.copy_from_host(layout_sfa_host.data());
layout_sfb_device.reset(problem_shapes.groups());
layout_sfb_device.copy_from_host(layout_sfb_host.data());
if constexpr (IsGroupGemm) {
return Arguments{
device_tensors_A.get(), stride_a_device.get(),
device_tensors_B.get(), stride_b_device.get(),
device_tensors_SFA.get(), layout_sfa_device.get(),
device_tensors_SFB.get(), layout_sfb_device.get()
};
}
else {
return Arguments{
device_tensors_A.get(), stride_a_host[0],
device_tensors_B.get(), stride_b_host[0],
device_tensors_SFA.get(), layout_sfa_host[0],
device_tensors_SFB.get(), layout_sfb_host[0]
};
}
}
auto to_host_args(ProblemShapeType problem_shapes, int batch) {
using namespace cute;
//
// Allocate the GEMM workspace
//
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(batch), 1);
auto A = make_tensor(make_iterator(tensors_A[batch].host_data()),
make_layout(make_shape(M, K, 1), stride_a_host[batch]));
auto SfA = make_tensor(tensors_SFA[batch].host_data(), layout_sfa_host[batch]);
auto B = make_tensor(make_iterator(tensors_B[batch].host_data()),
make_layout(make_shape(N, K, 1), stride_b_host[batch]));
auto SfB = make_tensor(tensors_SFB[batch].host_data(), layout_sfb_host[batch]);
return cutlass::reference::host::GettMainloopParams<ElementAccumulator,
decltype(A),
decltype(B),
decltype(SfA),
decltype(SfB)
>
{A, SfA, B, SfB};
}
void print_tensors(std::ofstream& file, int batch) {
file << "A =\n" << tensors_A[batch].host_view()
<< "\nB =\n" << tensors_B[batch].host_view()
<< "\nSFA =\n" << tensors_SFA[batch].host_view()
<< "\nSFB =\n" << tensors_SFB[batch].host_view();
}
bool compare_reference(
ProblemShapeType problem_shapes, int batch) {
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_A[batch].host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_B[batch].host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_SFA[batch].host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(tensors_SFB[batch].host_view()), 0);
return true;
}
};
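
Taken together, this specialization exposes the same surface as the other testbed mainloops: initialize operands, hand device pointers to the kernel arguments, then validate each batch against the host reference. A hedged sketch of that flow with mock types (the Mock* types and run_testbed are hypothetical names, not testbed API):

#include <cassert>

// Hypothetical mocks mirroring the HostCollectiveMainloop surface above.
struct MockProblemShape { int groups() const { return 1; } };
struct MockArguments {};
struct MockHostCollectiveMainloop {
  bool initialize(MockProblemShape) { return true; }              // allocate and fill A/B/SFA/SFB
  MockArguments to_args(MockProblemShape) { return {}; }          // upload pointer arrays, build args
  bool compare_reference(MockProblemShape, int) { return true; }  // per-batch validation
};

// The testbed flow: initialize, build kernel arguments, (launch), validate per batch.
template <class Mainloop, class ProblemShape>
bool run_testbed(Mainloop &mainloop, ProblemShape shape) {
  if (!mainloop.initialize(shape)) return false;
  auto args = mainloop.to_args(shape);
  (void)args;  // in the real testbed these feed Gemm::Arguments and a device launch
  bool passed = true;
  for (int b = 0; b < shape.groups(); ++b) {
    passed &= mainloop.compare_reference(shape, b);
  }
  return passed;
}

int main() {
  MockHostCollectiveMainloop mainloop;
  assert(run_testbed(mainloop, MockProblemShape{}));
  return 0;
}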
template<class Gemm>
struct HostCollectiveDefaultEpilogue {
// fusion types are potentially void if the fusion is not supported
@@ -803,6 +1103,24 @@ struct HostCollectiveEpilogue {
using FusionOp = typename Gemm::EpilogueOutputOp;
static_assert(cute::is_base_of_v<cutlass::epilogue::fusion::FusionOperation, FusionOp>);
// Scale factor Generation related
using SfStrategy = cutlass::reference::host::SfStrategy;
static constexpr bool IsBlockScaleSupported = FusionOp::IsBlockScaleSupported;
static constexpr SfStrategy SfGenStrategy = (!IsBlockScaleSupported) ? SfStrategy::None : SfStrategy::SfDGen;
static constexpr int32_t SFD_VectorSize = IsBlockScaleSupported ? FusionOp::SFVecSize : 1;
using ElementSFD = non_void_t<cute::remove_pointer_t<typename FusionOp::ElementBlockScaleFactor>, ElementD>;
using Sm100BlockScaledOutputConfig = cutlass::detail::Sm100BlockScaledOutputConfig<
SFD_VectorSize
>;
using Blk_MN = typename Sm100BlockScaledOutputConfig::Blk_MN;
using Blk_SF = typename Sm100BlockScaledOutputConfig::Blk_SF;
using OutputSFAtom = typename Sm100BlockScaledOutputConfig::SfAtom;
std::vector<cutlass::HostTensor<ElementSFD, LayoutTagD>> tensors_SFD;
std::vector<cutlass::HostTensor<ElementSFD, LayoutTagD>> references_SFD;
cutlass::DeviceAllocation<ElementSFD *> device_tensors_SFD;
using ElementCompute = typename FusionOp::ElementCompute;
using ElementScalar = typename FusionOp::ElementScalar;
using ElementBias = non_void_t<typename FusionOp::ElementBias>;
@@ -904,6 +1222,11 @@ struct HostCollectiveEpilogue {
references_D.clear();
stride_c_host.clear();
stride_d_host.clear();
tensors_SFD.clear();
references_SFD.clear();
auto [M, N, K, L] = cute::append<4>(problem_shapes.get_host_problem_shape(0), 1);
L = std::max(problem_shapes.groups(), L);
@@ -1034,6 +1357,26 @@ struct HostCollectiveEpilogue {
}
}
if constexpr (IsBlockScaleSupported) {
for (int32_t i = 0; i < L; ++i) {
auto [M, N, K, _] = cute::append<4>(problem_shapes.get_host_problem_shape(i), 1);
// If block-scaled output is supported, we always have at least 1 SFD
auto m_blks = cutlass::ceil_div(M, cute::size<0>(cute::shape(OutputSFAtom{})));
auto n_blks = cutlass::ceil_div(N, cute::size<1>(cute::shape(OutputSFAtom{})));
auto sfd_coord = [&] () {
return cutlass::make_Coord(m_blks * Blk_MN{}, n_blks * Blk_SF{});
}();
tensors_SFD.push_back(cutlass::HostTensor<ElementSFD, LayoutTagD>(sfd_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(sfd_coord, stride_factor_D)));
references_SFD.push_back(cutlass::HostTensor<ElementSFD, LayoutTagD>(sfd_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(sfd_coord, stride_factor_D), false));
tensors_SFD[i].sync_device();
}
norm_constant.resize(scalar_coord, true);
EXPECT_TRUE(initialize_tensor(norm_constant.host_view(), init_scale, seed + 2023));
norm_constant.sync_device();
}
return true;
}
@@ -1116,6 +1459,17 @@ struct HostCollectiveEpilogue {
passed &= tmp;
}
}
if constexpr (IsBlockScaleSupported) {
tensors_SFD[batch].sync_host();
bool passed_sf = equality_check(references_SFD[batch].host_view(), tensors_SFD[batch].host_view());
if (!passed_sf) {
std::cout << "SF is incorrect" << std::endl;
}
passed &= passed_sf;
}
return passed;
}
@@ -1308,6 +1662,19 @@ struct HostCollectiveEpilogue {
fusion_args.amax_aux_ptr = abs_max_Aux.device_data();
}
}
if constexpr (IsBlockScaleSupported) {
std::vector<ElementSFD *> ptr_SFD_host(L);
for (int32_t i = 0; i < L; ++i) {
ptr_SFD_host.at(i) = tensors_SFD[i].device_data();
}
device_tensors_SFD.reset(L);
device_tensors_SFD.copy_from_host(ptr_SFD_host.data());
arguments.thread.block_scale_factor_ptr = device_tensors_SFD.get();
arguments.thread.norm_constant_ptr = norm_constant.device_data();
}
}
return arguments;
@@ -1341,6 +1708,20 @@ struct HostCollectiveEpilogue {
cute::make_layout(cute::make_shape(M, N, cute::_1{}), cute::make_stride(cute::_1{}, cute::_0{}, M)));
auto Vbeta = cute::make_tensor(detail::make_iterator(beta.host_data()),
cute::make_layout(cute::make_shape(M, N, cute::_1{}), cute::make_stride(cute::_1{}, cute::_0{}, N)));
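// Select the reference SfD tensor at compile time: an immediately-invoked lambda
// lets the two if-constexpr branches return different tensor types.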
auto SfD = [&](){
if constexpr (IsBlockScaleSupported) {
auto tensor = make_tensor(detail::make_iterator(references_SFD[batch].host_data()),
Sm100BlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL));
return tensor;
}
else {
// The reference kernel skips the scale-factor computation when the tensor passed in has the same type as the output D tensor.
return D;
}
}();
cutlass::reference::host::GettEpilogueParams<
ElementScalar,
ElementScalar,
@@ -1353,8 +1734,11 @@ struct HostCollectiveEpilogue {
decltype(Valpha),
decltype(Vbeta),
ActivationFunctor
, decltype(SfD)
, Int<SFD_VectorSize>
, cutlass::plus<ElementCompute>
, false
, SfGenStrategy
> epilogue_params{};
epilogue_params.C = C;
@@ -1397,6 +1781,12 @@ struct HostCollectiveEpilogue {
epilogue_params.Vbeta = Vbeta;
}
}
if constexpr (IsBlockScaleSupported) {
epilogue_params.SfD = SfD;
epilogue_params.st = norm_constant.at(coord_0);
}
return epilogue_params;
}
};
@@ -1812,8 +2202,24 @@ bool TestSmall(double alpha = 1.0, double beta = 1.0,
using ElementB = typename Gemm::GemmKernel::ElementB;
using TiledMma = typename Gemm::GemmKernel::TiledMma;
int alignment_bits = 128;
static constexpr bool IsF8F6F4 = cutlass::gemm::collective::detail::is_sm100_mma_f8f6f4<TiledMma, ElementA, ElementB>();
alignment_bits = cutlass::detail::get_input_alignment_bits<ElementA, IsF8F6F4>();
// For fp4 and fp6 mx kernels, the minimum input alignment is already 128 elements, so no alignment offset needs to be added to the test problem sizes.
int alignment_input = (alignment_bits / cute::sizeof_bits<ElementA>::value == 128) ? 0 : (alignment_bits / cute::sizeof_bits<ElementA>::value);
if constexpr (apply_alignment_offset) {
// If BlockScaled, then min alignment is SFVecSize
static constexpr bool IsBlockScaleSupported = Gemm::EpilogueOutputOp::IsBlockScaleSupported;
static constexpr int SFVecSize = Gemm::GemmKernel::CollectiveMainloop::SFVecSize;
if constexpr (IsBlockScaleSupported) {
alignment_input = cutlass::round_up(alignment_input, SFVecSize);
}
}
using CtaShape_MNK = typename Gemm::GemmKernel::CollectiveMainloop::CtaShape_MNK;
using DispatchPolicy = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy;
CtaShape_MNK cta_shape;
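
A worked sketch of the alignment arithmetic above, with illustrative numbers. The bit widths and alignment_bits values are assumptions standing in for get_input_alignment_bits, not values read from the library.

#include <cstdio>

constexpr int round_up(int x, int m) { return ((x + m - 1) / m) * m; }

int main() {
  // Case 1: an f8f6f4-style kernel, 8-bit operands, 128-bit alignment (assumed).
  int alignment_bits = 128, sizeof_bits_A = 8;
  int alignment_input =
      (alignment_bits / sizeof_bits_A == 128) ? 0 : (alignment_bits / sizeof_bits_A);
  std::printf("8-bit operands: alignment_input = %d elements\n", alignment_input);  // 16

  // Case 2: a 4-bit mx kernel whose minimum alignment is already 128 elements
  // (assuming 512 alignment bits), so no extra offset is added to problem sizes.
  alignment_bits = 512; sizeof_bits_A = 4;
  alignment_input =
      (alignment_bits / sizeof_bits_A == 128) ? 0 : (alignment_bits / sizeof_bits_A);
  std::printf("4-bit mx operands: alignment_input = %d elements\n", alignment_input);  // 0

  // Block-scaled kernels additionally round a nonzero offset up to SFVecSize.
  int SFVecSize = 32;
  std::printf("rounded to SF vector size: %d\n", round_up(16, SFVecSize));  // 32
  return 0;
}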

View File

@@ -258,6 +258,12 @@ struct Testbed3xTensorBroadcast {
cute::make_layout(cute::make_shape(M, N, 1), cute::make_stride(cute::_1{}, cute::_0{}, M)));
auto dummy_Vbeta = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
cute::make_layout(cute::make_shape(M, N, 1), cute::make_stride(cute::_1{}, cute::_0{}, M)));
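// Block scaling is unused in this testbed: the null SFD tensor and zero SF vector
// size below only satisfy the newly added GettEpilogueParams template parameters.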
auto dummy_SFD = cute::make_tensor(static_cast<ElementD*>(nullptr),
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
using DummySFDVectorSize = cute::Int<0>;
cutlass::reference::host::GettEpilogueParams<
ElementScalar,
ElementScalar,
@@ -270,6 +276,8 @@ struct Testbed3xTensorBroadcast {
decltype(dummy_Valpha),
decltype(dummy_Vbeta),
ActivationFunctor,
decltype(dummy_SFD),
DummySFDVectorSize,
cutlass::plus<ElementCompute>,
PerColBias> epilogue_params{
alpha,

View File

@@ -0,0 +1,150 @@
# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
if(CUTLASS_NVCC_ARCHS MATCHES "100")
add_custom_target(
cutlass_test_unit_gemm_device_sm100_blockscaled
DEPENDS
cutlass_test_unit_gemm_device_bstensorop_sm100_nvf4xnvf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_nvf4xnvf4
BATCH_SOURCES ON
BATCH_SIZE 1
nvf4_nvf4_bf16_bf16.cu
nvf4_nvf4_bf16_bf16_features.cu
nvf4_nvf4_f16_nvfp4_epilogue.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf4
BATCH_SOURCES ON
BATCH_SIZE 1
mxf4_mxf4_void_f16_tn_layout.cu
mxf4_mxf4_void_f16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf6
BATCH_SOURCES ON
BATCH_SIZE 1
mxf6_mxf6_void_bf16_tn_layout.cu
mxf6_mxf6_void_bf16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf8
BATCH_SOURCES ON
BATCH_SIZE 1
mxf8_mxf8_void_f8_tn_layout.cu
mxf8_mxf8_void_f8_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf8
BATCH_SOURCES ON
BATCH_SIZE 1
mxf6_mxf8_void_f32_tn_layout.cu
mxf6_mxf8_void_f32_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf6
BATCH_SOURCES ON
BATCH_SIZE 1
mxf8_mxf6_f16_f8_tn_layout.cu
mxf8_mxf6_f16_f8_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf8
BATCH_SOURCES ON
BATCH_SIZE 1
mxf4_mxf8_bf16_bf16_tn_layout.cu
mxf4_mxf8_bf16_bf16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf8xmxf4
BATCH_SOURCES ON
BATCH_SIZE 1
mxf8_mxf4_f16_bf16_tn_layout.cu
mxf8_mxf4_f16_bf16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf6xmxf4
BATCH_SOURCES ON
BATCH_SIZE 1
mxf6_mxf4_f16_f16_tn_layout.cu
mxf6_mxf4_f16_f16_nt_layout.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_device_bstensorop_sm100_mxf4xmxf6
BATCH_SOURCES ON
BATCH_SIZE 1
mxf4_mxf6_f32_f16_tn_layout.cu
mxf4_mxf6_f32_f16_nt_layout.cu
)
endif()

View File

@@ -0,0 +1,303 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the A/B layouts for this mxfp4 x mxfp4 GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128x128x256_1x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy: Auto or a targeted scheduling policy; the underlying selection here is KernelTmaWarpSpecialized1SmMxf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 128x256x256_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m1t_void_f16t_bstensorop_f32, 256x256x256_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the A/B layouts for this mxfp4 x mxfp4 GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x128x256_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy: Auto or a targeted scheduling policy; the underlying selection here is KernelTmaWarpSpecialized1SmMxf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x192x256_1x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 128x256x256_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x192x256_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto resolves to the targeted policy; for this 2SM mxf4 GEMM the underlying selection is KernelTmaWarpSpecialized2SmMxf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m1n_void_f16t_bstensorop_f32, 256x256x256_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
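// --------------------------------------------------------------------------
// Illustrative host-side launch flow (a sketch only; TestAll<Gemm>() above is
// the actual harness). The adapter entry points (get_workspace_size,
// can_implement, initialize, run) are the CUTLASS 3.x device API; the exact
// mainloop argument fields for block-scaled kernels (scale-factor pointers
// and layouts) are assumptions for illustration.
//
//   Gemm gemm;
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kGemm,
//     {M, N, K, /*L=*/1},                         // problem shape
//     {ptr_A, stride_A, ptr_B, stride_B,          // operands, plus per-block
//      ptr_SFA, layout_SFA, ptr_SFB, layout_SFB}, // scale factors (assumed fields)
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D}
//   };
//   size_t workspace_size = Gemm::get_workspace_size(args);
//   cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
//   if (gemm.can_implement(args) == cutlass::Status::kSuccess &&
//       gemm.initialize(args, workspace.get()) == cutlass::Status::kSuccess) {
//     gemm.run();
//   }
// --------------------------------------------------------------------------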
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for this mxfp4 x mxfp6 mixed-precision GEMM:
the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned. For example, a
row-major (T) B operand has stride-1 along N, so the 128x192x128 tile is unsupported for TT/NT layouts
(see the matrix below).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f32_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
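// mx_float6_t pairs e2m3 (FP6) data with a shared ue8m0 scale factor per 32-element block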
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
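// With a 4x4x1 cluster, TMA can multicast each A tile to the 4 CTAs sharing its M index and each B tile to the 4 CTAs sharing its N index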
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, an f32 source (C) tensor consumes too much SMEM in the epilogue, so f16 is used.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe2m3t_f16_f16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for this mxfp4 x mxfp6 mixed-precision GEMM:
the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned. For example, a
row-major (T) B operand has stride-1 along N, so the 128x192x128 tile is unsupported for TT/NT layouts
(see the matrix below).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f32_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 128x192x128_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, an f32 source (C) tensor consumes too much SMEM in the epilogue, so f16 is used.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x128x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe3m2n_f32_f16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = float;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe2m3n_f16_f16t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* Supported MMA tile shapes:
For the A tensor (mxfp4), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Since every tile below has K=128 and a per-CTA M of 128, and the mxfp8 B tensor only needs 16-element
alignment, all layout combinations are supported.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
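// 16 e5m2 elements = 16 bytes, matching TMA's 128-bit alignment requirement for fp8 operands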
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 128x192x128_1x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, a wider (f32) source C tensor would consume too much SMEM in the epilogue; bf16 is used for C and D.
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto, or a targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe4m3t_bf16_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1n_ue8m0xe5m2t_bf16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
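//
// Hedged sketch of the tile-shape relationship used by the 2SM tests above:
// the MMA tile spans a pair of CTAs along M, so each SM's epilogue tile
// (PerSmTileShape) covers half of the MMA tile in M, while 1SM kernels use
// the MMA tile directly. Checked here for the 256x256x128 case.
//
static_assert(cute::size<0>(Shape<_256,_256,_128>{}) ==
              2 * cute::size<0>(Shape<_128,_256,_128>{}),
              "2SM kernels: PerSmTileShape M == MmaTileShape M / 2");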
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp4xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
 * Supported Mma tile shapes:
     For the A tensor (mxfp4 type), the stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
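//
// Hedged sketch relating the alignments above to byte counts: the packed
// 4-bit mxfp4 A operand needs a larger element alignment than the 8-bit
// mxfp8 B operand. The arithmetic below is plain bit counting, stated for
// illustration rather than quoted from this file.
//
static_assert(128 * cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 64 * 8,
              "AlignA: 128 e2m1 elements span 64 bytes");
static_assert(16 * cutlass::sizeof_bits<cutlass::float_e5m2_t>::value == 16 * 8,
              "AlignB: 16 fp8 elements span 16 bytes");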
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, f32 C/D tensors consume too much epilogue SMEM, so a 16-bit output type (bf16 here) is used.
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
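//
// Hedged sketch of what "Cluster size for multicast" means in the tests
// above: the cluster shape groups CTAs so that TMA loads of A/B tiles can be
// multicast to the CTAs of a cluster that share them. For the 128x256 test
// above, Shape<_4,_2,_1> groups 4 x 2 = 8 CTAs per cluster.
//
static_assert(cute::size(Shape<_4,_2,_1>{}) == 8, "4x2x1 cluster = 8 CTAs");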
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 256x128x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe4m3n_bf16_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m1t_ue8m0xe5m2n_bf16_bf16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
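//
// Hedged note, in code form, on the kernel composition used throughout these
// tests: GemmUniversal's first template argument is the runtime problem
// shape, and the four ints are (M, N, K, L) with L the batch count (standard
// CUTLASS 3.x convention; the alias name is illustrative, not from this file).
//
using ProblemShapeMNKL_Illustrative = Shape<int, int, int, int>;  // (M, N, K, L = batch)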
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
 * The supported Mma tile shapes depend on the operand layouts for mxfp4 x mxfp6 mixed-precision GEMM.
     The stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
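//
// Hedged sketch of the scale-factor bookkeeping implied by the mx_float*_t
// types above: MX formats pair each block of elements along K with one ue8m0
// scale factor. Assuming the OCP MX block size of 32 elements (an assumption,
// not stated in this file), an M x K operand carries about M * ceil(K / 32)
// scale factors per batch.
//
constexpr int kSfVecSizeIllustrative = 32;  // assumed elements per ue8m0 scale factor
constexpr long long num_scale_factors(long long m, long long k) {
  return m * ((k + kSfVecSizeIllustrative - 1) / kSfVecSizeIllustrative);  // M * ceil_div(K, 32)
}
static_assert(num_scale_factors(128, 128) == 128 * 4, "a 128x128 tile has 4 scale factors per row");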
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, using f32 for C and D consumes too much epilogue SMEM, so f16 is used instead.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m1t_f16_f16t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
 * The supported Mma tile shapes depend on the operand layouts for mxfp4 x mxfp6 mixed-precision GEMM.
     The stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
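//
// Hedged sketch of what StageCountAutoCarveout expresses in the mainloop
// builders above: the builder picks the mainloop stage count only after
// reserving the epilogue's shared storage, roughly
// (smem_capacity - epilogue_smem) / bytes_per_mainloop_stage. All numbers
// below are illustrative placeholders, not values from this file.
//
constexpr int estimate_stages(int smem_capacity_bytes, int epilogue_smem_bytes,
                              int bytes_per_mainloop_stage) {
  return (smem_capacity_bytes - epilogue_smem_bytes) / bytes_per_mainloop_stage;
}
static_assert(estimate_stages(200 * 1024, 64 * 1024, 32 * 1024) == 4, "illustrative only");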
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 128x256x128_1x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, using f32 for C and D consumes too much epilogue SMEM, so f16 is used instead.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x128x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m1n_f16_f16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::half_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
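  // Note (illustrative): KernelScheduleAuto defers the schedule choice to the
  // collective builder for this configuration. An explicit alternative for a
  // 2SM block-scaled mainloop is the policy used in the preceding test,
  // cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100.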
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for mxfp6 GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned
(an illustrative sketch of this rule follows the comment block).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*)| NN |
|--------|---------------|----|----|-------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
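// Illustrative sketch (hypothetical helper, not part of CUTLASS): one way to
// read the support matrix above. An operand whose MN-mode is stride-1 in
// global memory (A column-major or B row-major) needs its per-CTA tile extent
// in that mode to be a multiple of 128; for 2SM kernels, each CTA of the pair
// is assumed to hold half of the MMA tile's M and N extents. K-major operands
// always pass, since every K tile here is 128.
constexpr bool mxfp6_tile_supported(bool is_2sm, int tile_m, int tile_n,
                                    bool a_is_m_major, bool b_is_n_major) {
  int const cta_m = is_2sm ? tile_m / 2 : tile_m;
  int const cta_n = is_2sm ? tile_n / 2 : tile_n;
  return (!a_is_m_major || cta_m % 128 == 0) &&
         (!b_is_n_major || cta_n % 128 == 0);
}
// Spot-check two NT entries from the table: 1SM 128x192 is 'N', 2SM 256x256 is 'Y'.
static_assert(!mxfp6_tile_supported(false, 128, 192, true, true), "NT 128x192 1SM unsupported");
static_assert( mxfp6_tile_supported(true, 256, 256, true, true), "NT 256x256 2SM supported");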
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
  // For N=256, using f32 or f16 for the C/D tensors consumes too much SMEM in the epilogue (see the arithmetic note below).
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
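  // Arithmetic behind the comment above (illustrative): staging a 128x256
  // output tile through SMEM costs 128*256*4 B = 128 KiB per stage in f32,
  // and 64 KiB in f16/bf16. Dropping C (void) and storing D as bf16 keeps the
  // epilogue carveout small enough to leave room for mainloop stages.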
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe2m3t_void_bf16t_bstensorop_f32, 256x256x128_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* The supported MMA tile shapes depend on the operand layouts for mxfp6 mixed-precision GEMM.
The tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned
(see the note after this comment block).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*)| TT | NT | NN |
|--------|---------------|-------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
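// Note on the TN column above: with A row-major (M x K) and B column-major
// (K x N), the stride-1 mode of both operands is K, so the 128-element
// alignment requirement falls on the K tile extent. Every MMA tile in this
// file uses K = 128, which is why every TN entry in the matrix is 'Y'.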
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
  // For N=256, using f32 or f16 for the C/D tensors consumes too much SMEM in the epilogue.
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 256x128x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe3m2n_void_bf16t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe2m3n_void_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* Supported MMA tile shapes:
For the A tensor (mxfp6 type), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned
(see the note after this comment block).
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
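// Note on the matrix above: only the mxfp6 A operand carries the 128-element
// stride-1 requirement; the mxfp8 B operand needs just 16-element alignment
// (AlignB = 16 below). With A column-major, the constrained extent is the
// per-CTA M tile, which is 128 in every configuration here (1SM tiles use
// M = 128, and 2SM tiles split M = 256 across the CTA pair), so every cell
// is 'Y'.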
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256x128x128_4x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3n_ue8m0xe4m3t_void_f32t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2n_ue8m0xe5m2t_void_f32t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp6xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* Mma Tile Shapes supported:
For the A tensor (an mxfp6 type), the stride-1 tile dimension must be divisible by 128, i.e., be 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
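// Illustrative helper (editorial sketch; the name below is hypothetical and
// not part of the original tests): the 128-element rule above, expressed as a
// constexpr predicate on the stride-1 extent of the mxfp6 A tensor (K for the
// row-major A used by the tests in this file).
constexpr bool mxfp6_stride1_extent_ok(int extent) {
  return extent % 128 == 0;  // stride-1 dimension must be 128-element aligned
}
static_assert(mxfp6_stride1_extent_ok(128), "K = 128 (one Mma tile) satisfies the rule");
static_assert(!mxfp6_stride1_extent_ok(144), "an extent not divisible by 128 violates it");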
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
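// Illustrative check (editorial sketch, not part of the original test): with
// a 1SM kernel schedule the per-SM epilogue tile is simply the full MMA tile;
// contrast the 2SM tests later in this file, where M is halved per SM.
static_assert(cute::size<0>(PerSmTileShape_MNK{}) == cute::size<0>(MmaTileShape_MNK{}), "1SM: per-SM tile equals the MMA tile");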
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, carrying a source C tensor (f32 or f16) would consume too much SMEM in the epilogue, so C is void here.
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe3m2t_ue8m0xe4m3n_void_f32t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe2m3t_ue8m0xe5m2n_void_f32t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignA = 128;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 4;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = float;
constexpr int AlignD = 4;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* Mma Tile Shapes supported:
For the B tensor (an mxfp4 type), the stride-1 tile dimension must be divisible by 128, i.e., be 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
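// Editorial sketch (not part of the original file): the alignment requirements
// above are element counts; in bytes they follow from the storage widths of
// the narrow types (e4m3/e5m2 are 8-bit, e2m1 is 4-bit).
static_assert(cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 4, "mxfp4 payload is 4 bits");
static_assert(16 * cutlass::sizeof_bits<cutlass::float_e4m3_t>::value / 8 == 16, "A: 16 elements -> 16 bytes");
static_assert(128 * cutlass::sizeof_bits<cutlass::float_e2m1_t>::value / 8 == 64, "B: 128 elements -> 64 bytes");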
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
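// Editorial sketch of the launch flow that TestAll wraps (assumption:
// canonical CUTLASS 3.x adapter usage; block-scaled mainloops additionally
// take scale-factor tensors in their arguments, which the testbed populates):
//
//   Gemm gemm;
//   auto args = /* typename Gemm::Arguments built from the problem shape,
//                  A/B/C/D pointers and strides, and the SFA/SFB scale factors */;
//   if (Gemm::can_implement(args) == cutlass::Status::kSuccess) {
//     size_t workspace_bytes = Gemm::get_workspace_size(args);
//     gemm.initialize(args, workspace_ptr);  // workspace_ptr: device allocation of workspace_bytes
//     gemm.run();
//   }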
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
// For N=256, f32 C/D tensors would consume too much SMEM in the epilogue, so 16-bit C/D types are used here.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m1t_f16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp4 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* B tensor:
* Types: {e2m1}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* Mma Tile Shapes supported:
For the B tensor (an mxfp4 type), the stride-1 tile dimension must be divisible by 128, i.e., be 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
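// Note on test naming (editorial, restating the layout legend above): the
// trailing 'n'/'t' of each operand token in a test name encodes that
// operand's GMEM layout -- 'n' = ColumnMajor, 't' = RowMajor -- so
// "ue8m0xe4m3t_ue8m0xe2m1n" below pairs a row-major mxfp8 A with a
// column-major mxfp4 B, i.e. the TN column of the support matrix.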
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 128x256x128_4x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x192x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m1n_f16_bf16t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
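// TestAll<Gemm> hides argument construction. The sketch below shows the shape
// of an explicit invocation: the block-scaled mainloop takes, alongside A/B,
// the scale-factor tensors SFA/SFB whose layouts are derived from the problem
// shape. The field names (mainloop, epilogue, ptr_SFA/layout_SFA, ...) and the
// Sm1xxBlkScaledConfig helper follow the CUTLASS block-scaled examples and are
// assumptions here; CollectiveMainloop::Arguments is the authoritative source.
template <class Gemm>
typename Gemm::Arguments make_blockscaled_arguments(
    int m, int n, int k, int l,
    typename Gemm::GemmKernel::CollectiveMainloop::Arguments const& mainloop,
    typename Gemm::GemmKernel::CollectiveEpilogue::Arguments const& epilogue) {
  typename Gemm::Arguments args{};
  args.mode          = cutlass::gemm::GemmUniversalMode::kGemm;
  args.problem_shape = cute::make_shape(m, n, k, l);  // (M, N, K, batch L)
  args.mainloop      = mainloop;  // {ptr_A, stride_A, ptr_B, stride_B,
                                  //  ptr_SFA, layout_SFA, ptr_SFB, layout_SFB}
  args.epilogue      = epilogue;
  return args;
}
// The scale-factor layouts would come from the kernel's block-scaled config,
// e.g. layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(shape_MNKL).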
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,304 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Row Major (T)
* Alignment: 128 elements
* Mma Tile Shapes supported:
    For the B tensor (mxfp6 type), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x128x128_1x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
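// The 128-element alignment required of the mxfp6 operand (see the file
// header) is enforced at runtime: can_implement() rejects argument sets whose
// extents or pointers violate an operand's alignment contract before anything
// launches. A small checker sketch around the standard CUTLASS 3.x
// Status-based API:
template <class Gemm>
bool problem_is_implementable(typename Gemm::Arguments const& args) {
  Gemm gemm_op;
  return gemm_op.can_implement(args) == cutlass::Status::kSuccess;
}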
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
  // For N=256, f32 or f16 C/D tensors consume too much SMEM in the epilogue, hence the narrower output types.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe2m3t_f16_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp6 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* B tensor:
* Types: {e2m3,e3m2}xue8m0
* Layout: Column Major (N)
* Alignment: 128 elements
* Mma Tile Shapes supported:
    For the B tensor (mxfp6 type), the tile dimension with stride-1 must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | N | N | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | N | N | Y |
| 2SM | 256x192x128 | Y | N | N | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
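// StageCountAutoCarveout<N> reserves N bytes of SMEM before the mainloop
// builder auto-computes how many pipeline stages fit; here N is exactly the
// epilogue's SharedStorage footprint. A one-line restatement of the quantity
// being carved out:
template <class CollectiveEpilogue>
constexpr int epilogue_smem_carveout_bytes() {
  return static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage));
}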
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
  // For N=256, f32 or f16 C/D tensors consume too much SMEM in the epilogue, hence the narrower output types.
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
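// For 2SM kernels such as the test above, a CTA pair cooperates on each MMA
// tile, so the per-SM epilogue tile carries half of the MMA tile's M extent
// (256x128 MMA tile -> 128x128 per-SM tile) with N unchanged. A compile-time
// sanity-check sketch using cute's standard shape accessors (adding such an
// assert to the tests is our illustration, not existing harness code):
template <class MmaTileShape, class PerSmTileShape>
constexpr bool per_sm_tile_matches_2sm_mma_tile() {
  return cute::size<0>(MmaTileShape{}) == 2 * cute::size<0>(PerSmTileShape{}) &&
         cute::size<1>(MmaTileShape{}) == cute::size<1>(PerSmTileShape{});
}
// e.g. static_assert(per_sm_tile_matches_2sm_mma_tile<
//          Shape<_256,_128,_128>, Shape<_128,_128,_128>>());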
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe3m2n_f16_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e3m2_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe2m3n_f16_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float6_t<cutlass::float_e2m3_t>;
constexpr int AlignB = 128;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e4m3_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,523 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* Mma Tile Shapes supported:
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN | TT | NT (*) | NN |
|--------|---------------|----|----|--------|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x128x128_1x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
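// ElementC = void builds a source-less epilogue: D = alpha * accumulator with
// no beta * C term, so GmemLayoutC/AlignC above are interface placeholders and
// the C pointer slot is simply null at runtime. A wiring sketch; the
// {thread, ptr_C, stride_C, ptr_D, stride_D} field order and the {alpha, beta}
// fusion arguments follow the usual CUTLASS 3.x collective epilogue Arguments
// and are assumptions here:
template <class Gemm, class ElementD, class StrideD>
typename Gemm::GemmKernel::CollectiveEpilogue::Arguments
make_sourceless_epilogue_args(ElementD* ptr_D, StrideD stride_D, float alpha) {
  // beta is meaningless without a C tensor; the null source is passed explicitly.
  return {{alpha, 0.f}, nullptr, {}, ptr_D, stride_D};
}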
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 128x192x128_1x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
  // For N=256, f32 or f16 C/D tensors consume too much SMEM in the epilogue, hence void C and an 8-bit D.
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto                     // Kernel schedule policy: Auto lets the builder pick a suitable schedule; a specific policy tag may be named instead
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
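// KernelScheduleAuto (used in the test above) lets the builder choose the
// mainloop schedule, while tags such as KernelTmaWarpSpecialized2SmBlockScaledSm100
// (next test) pin it explicitly. Either way the decision is recorded on the
// built collective; e.g. the pipeline depth it settled on can be inspected
// (DispatchPolicy::Stages is the conventional CUTLASS 3.x location; treating
// it as stable API is an assumption):
template <class CollectiveMainloop>
constexpr int selected_mainloop_stage_count() {
  return CollectiveMainloop::DispatchPolicy::Stages;
}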
TEST(SM100Only_Device_Gemm_ue8m0xe5m2n_ue8m0xe4m3t_void_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3n_ue8m0xe5m2t_void_f8t_bstensorop_f32, 256x256x128_4x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::RowMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,524 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for mxfp8xmxfp8 Block Scaled Gemm
* A tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Row Major (T)
* Alignment: 16 elements
* B tensor:
* Types: {e5m2,e4m3}xue8m0
* Layout: Column Major (N)
* Alignment: 16 elements
* Mma Tile Shapes supported:
For the A tensor (mxfp6 type, not exercised by this file's fp8 tests), the stride-1 tile dimension must be divisible by 128, i.e., 128-element aligned.
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x128 | Y | Y | Y | Y |
| 1SM | 128x192x128 | Y | Y | Y | Y |
| 1SM | 128x256x128 | Y | Y | Y | Y |
| 2SM | 256x128x128 | Y | Y | Y | Y |
| 2SM | 256x192x128 | Y | Y | Y | Y |
| 2SM | 256x256x128 | Y | Y | Y | Y |
(*) Unit tests in this file
*/
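// --- Illustrative sketch (not part of the tests) ---------------------------
// How the ue8m0 scale factors relate to the fp8 data: assuming the standard
// OCP MX block size of 32, every 32 consecutive elements along K share one
// 8-bit (ue8m0) scale. The hypothetical helper below only illustrates the
// resulting scale-factor count; the actual CUTLASS scale-factor layouts are
// produced by the collective builders.
namespace mxfp8_sketch {
constexpr int kMxBlockSize = 32;  // OCP MX spec: 32-element scaling blocks
// Number of ue8m0 scale factors for an M x K MXFP8 operand.
constexpr long long scale_factor_count(long long M, long long K) {
  return M * ((K + kMxBlockSize - 1) / kMxBlockSize);
}
static_assert(scale_factor_count(128, 128) == 512, "one scale per 32 K-elements");
}  // namespace mxfp8_sketch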
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
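// Naming convention for the tests below: each operand is written as
// <scale-factor type>x<element type> plus a BLAS-style layout letter,
// 't' for row-major (transposed) and 'n' for column-major. For example,
// ue8m0xe5m2t is a row-major e5m2 tensor with ue8m0 scale factors.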
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x128x128_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
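// Note: TestAll (from gemm_testbed_3x.hpp) exercises a sweep of problem
// shapes and epilogue scalars, builds Gemm::Arguments (including the
// scale-factor tensors for A and B), runs the kernel through the
// GemmUniversalAdapter, and verifies the device output against a host
// reference. The remaining tests in this file differ only in element types,
// tile/cluster shapes, and kernel schedule policy.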
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 128x192x128_2x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 128x256x128_4x2x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
// For N=256, an f32 or f16 C tensor would consume too much SMEM in the epilogue, so C is void (no source tensor).
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x128x128_4x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe5m2t_ue8m0xe4m3n_void_f8t_bstensorop_f32, 256x192x128_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_ue8m0xe4m3t_ue8m0xe5m2n_void_f8t_bstensorop_f32, 256x256x128_2x1x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
constexpr int AlignA = 16;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::mx_float8_t<cutlass::float_e5m2_t>;
constexpr int AlignB = 16;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = void;
constexpr int AlignC = 16;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e5m2_t;
constexpr int AlignD = 16;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_128>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_128>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)


@ -0,0 +1,683 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for nvfp4xnvfp4 Block Scaled Gemm
* A tensor:
* Types: {e2m1}xue4m3
* Layout: Row Major (T)
* Alignment: 32 elements
* B tensor:
* Types: {e2m1}xue4m3
* Layout: Column Major (N)
* Alignment: 32 elements
* Mma Tile Shapes supported:
Support Matrix (Y: Yes, N: No)
| 1/2 SM | Mma Tile Size | TN (*) | TT | NT | NN |
|--------|---------------|--------|----|----|----|
| 1SM | 128x128x256 | Y | N | N | N |
| 1SM | 128x192x256 | Y | N | N | N |
| 1SM | 128x256x256 | Y | N | N | N |
| 2SM | 256x128x256 | Y | N | N | N |
| 2SM | 256x192x256 | Y | N | N | N |
| 2SM | 256x256x256 | Y | N | N | N |
(*) Unit tests in this file
*/
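// --- Illustrative sketch (not part of the tests) ---------------------------
// Why the operand alignment above is 32 elements: TMA copies require 16-byte
// (128-bit) alignment, so the minimum element alignment is 128 bits divided
// by the element width. For 4-bit e2m1 that is 32 elements; for the 8-bit
// formats in the mxfp8 tests it is 16. (NVFP4 additionally pairs each
// 16-element block with one ue4m3 scale factor.)
namespace nvfp4_sketch {
constexpr int min_tma_alignment_elems(int bits_per_element) {
  return 128 / bits_per_element;  // 128 bits == 16 bytes
}
static_assert(min_tma_alignment_elems(4) == 32, "e2m1 operands");
static_assert(min_tma_alignment_elems(8) == 16, "e4m3/e5m2 operands");
}  // namespace nvfp4_sketch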
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_4x4x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x2x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////
//
// Using automatic kernel schedule selection (KernelScheduleAuto) with **static** cluster shapes
//
///////////////////////////////////////////////////////////////////////////////
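// The cluster shapes in these tests are compile-time constants (cute::Shape
// of static integers), which lets the builders specialize the TMA multicast
// and CTA pairing at compile time. CUTLASS 3.8 alternatively supports
// dynamic clusters (e.g. Shape<int,int,_1>) whose size is chosen at launch;
// those are not exercised here.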
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
//
// Using large Cta Tiles: N=192 and N=256
//
//////////////////////////////////////////////////////////////////////////////
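// These tests pin targeted kernel schedules (1Sm/2Sm BlockScaled and Nvf4
// policies) rather than KernelScheduleAuto, exercising the wider N=192 and
// N=256 output tiles with both schedule families.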
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x192x256_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x256x256_2x1x1_1sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_1,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x192x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_192,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_192,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x256x256_2x4x1_2sm_auto) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_256,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_256,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)


@ -0,0 +1,374 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for runtime (type-erased) data types in block-scaled FP4 GEMMs
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////
//
// Using Runtime Types
//
//////////////////////////////////////////////////////////////////////////////
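// With cutlass::type_erased_dynamic_nv_float4_t the kernel is compiled once
// against type-erased fp4 operands; the concrete encoding is supplied at run
// time. The testbed forwards the desired encodings as cute::UMMA::MXF4Format
// arguments (E2M1 below), so no per-datatype recompilation is needed.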
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_4x2x1_1sm_auto_runtime_dtypes) {
// Describe A and B tensors
using ElementA = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF4Format::E2M1, cute::UMMA::MXF4Format::E2M1);
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x4x1_2sm_auto_runtime_dtypes) {
// Describe A and B tensors
using ElementA = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::type_erased_dynamic_nv_float4_t;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF4Format::E2M1, cute::UMMA::MXF4Format::E2M1);
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
//
// Using Stream-K Scheduler
//
//////////////////////////////////////////////////////////////////////////////
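// Stream-K balances work across SMs by additionally splitting output tiles
// along the K dimension: SMs cooperatively produce partial accumulations for
// a tile, which are reduced before the epilogue runs. This avoids the idle
// tail that a purely tile-parallel schedule leaves when the tile count does
// not divide evenly across the machine.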
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 128x128x256_1x4x1_1sm_auto_streamK) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_1,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
TileScheduler // Specify the streamK scheduler for the kernel
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_NVue4m3xe2m1t_NVue4m3xe2m1n_bf16t_bf16t_bstensorop_f32, 256x128x256_2x2x1_2sm_auto_streamK) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::bfloat16_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::bfloat16_t;
constexpr int AlignD = 8;
using GmemLayoutD = cutlass::layout::RowMajor;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Tile and cluster shapes
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_2,_2,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Tile Scheduler
using TileScheduler = cutlass::gemm::StreamKScheduler;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
cutlass::epilogue::collective::EpilogueScheduleAuto // Epilogue schedule policy
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
TileScheduler // Specify the streamK scheduler for the kernel
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,436 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit test for nvfp4 Block Scaled Gemm with nvfp4 output
D tensor:
* Types: e2m1x{ue4m3}
      * Layout: Row Major (T)
      * Alignment: 32
    * Scale factors are generated together with the fp4 output, along the contiguous dimension of the D tensor (an illustrative sketch follows this comment block).
    * Before scale-factor generation, other epilogue fusion operations may be applied:
* alpha
* beta
* activation
* bias
This UT tests
- alpha + beta + scale-factor generation
- alpha + beta + bias + scale-factor generation
*/
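// Illustrative sketch of per-block scale-factor generation (an assumption for
// exposition, not the exact CUTLASS implementation). One ElementSFD is
// produced per SFDVectorSize contiguous D elements; acc[]/d[] and the
// cast_to_* helpers are hypothetical names:
//
//   float amax = 0.f;
//   for (int i = 0; i < SFDVectorSize; ++i) { amax = fmaxf(amax, fabsf(acc[i])); }
//   ElementSFD sfd = cast_to_ue4m3(amax / 6.0f);  // 6.0f == max finite float_e2m1_t
//   for (int i = 0; i < SFDVectorSize; ++i) { d[i] = cast_to_e2m1(acc[i] / float(sfd)); }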
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../../common/cutlass_unit_test.h"
#include "../gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 16)
// with alpha/beta fusion
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstensorop_1sm_f32, 128x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct FusionOperation
//
constexpr int SFDVectorSize = 16;
// Define the fusion operation applied during epilogue
using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
SFDVectorSize,
ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementC
>;
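  // In outline, this fusion computes D = e2m1_quantize(alpha * acc + beta * C)
  // and, as a side output, writes one float_ue4m3_t scale factor per
  // SFDVectorSize contiguous elements of D (see the sketch at the top of this
  // file for the per-block math).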
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
      FusionOperation // Block scale-factor generation fusion defined above
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs16_bstensorop_2sm_f32, 256x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_256,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
//
// Construct FusionOperation
//
constexpr int SFDVectorSize = 16;
// Define the fusion operation applied during epilogue
using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
SFDVectorSize,
ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementC
>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
      FusionOperation // Block scale-factor generation fusion defined above
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 32)
// with alpha/beta fusion
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1t_outputVs32_bstensorop_1sm_f32, 128x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
//
// Construct FusionOperation
//
constexpr int SFDVectorSize = 32;
// Define the fusion operation applied during epilogue
using FusionOperation = cutlass::epilogue::fusion::LinCombBlockScaleFactor<
SFDVectorSize,
ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementC
>;
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
PerSmTileShape_MNK, ClusterShape_MNK, // Epilogue tile shape, and cluster shape
cutlass::epilogue::collective::EpilogueTileAuto, // Epilogue subtile shape. Auto will find a suitable tile shape
ElementAccumulator, ElementCompute, // Mma instr's accumulator type and compute precision for epilogue
ElementC, GmemLayoutC, AlignC, // C tensor description
ElementD, GmemLayoutD, AlignD, // D tensor description
      cutlass::epilogue::collective::EpilogueScheduleAuto, // Epilogue schedule policy
      FusionOperation // Block scale-factor generation fusion defined above
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp, // Arch and Tensorop spec
ElementA, GmemLayoutA, AlignA, // A tensor elem type, layout and alignment requirement
ElementB, GmemLayoutB, AlignB, // B tensor elem type, layout and alignment requirement
ElementAccumulator, // Mma instruction accumulator type
MmaTileShape_MNK, ClusterShape_MNK, // Mma instruction tile shape, cluster shape
// Epilogue's SMEM usage that needs to be subtracted from overall SMEM capacity
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto // Kernel schedule policy. Auto or using targeted scheduling policy
>::CollectiveOp;
// Create Gemm Kernel using CollectiveEpilogue and CollectiveMainloop created by the builders
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
// Run tests
auto pass = test::gemm::device::TestAll<Gemm>();
// Check results
EXPECT_TRUE(pass);
}
//////////////////////////////////////////////////////////////////////////////
// FusionOperation: k-major output and datatype is float_e2m1_t with float_ue4m3_t scale-factor (vecsize 16)
// with alpha+beta+relu+bias fusion
//////////////////////////////////////////////////////////////////////////////
TEST(SM100Only_Device_Gemm_ue4m3xe2m1t_ue4m3xe2m1n_ue4m3xe2m1n_outputVs16_bstensorop_1sm_f32_bias_relu, 128x128x256_4x4x1) {
// Describe A and B tensors
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignA = 32;
using GmemLayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
constexpr int AlignB = 32;
using GmemLayoutB = cutlass::layout::ColumnMajor;
// Describe C and D tensors
using ElementC = cutlass::half_t;
constexpr int AlignC = 8;
using GmemLayoutC = cutlass::layout::RowMajor;
using ElementD = cutlass::float_e2m1_t;
constexpr int AlignD = 32;
using GmemLayoutD = cutlass::layout::RowMajor;
// Describe SFD tensor
using ElementSFD = cutlass::float_ue4m3_t;
using GmemLayoutSFD = GmemLayoutD;
// Mma's accumulator type
using ElementAccumulator = float;
// Epilogue computation's precision type
using ElementCompute = float;
// Bias type
using ElementBias = float;
// Collective MMA takes tile shape of the MMA operation as input
using MmaTileShape_MNK = Shape<_128,_128,_256>;
// Cluster size for multicast
using ClusterShape_MNK = Shape<_4,_4,_1>;
// Collective Epilogue takes the output tile shape for 1 CTA
using PerSmTileShape_MNK = Shape<_128,_128,_256>;
  constexpr int SFDVectorSize = 16;
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasBlockScaleFactor<
SFDVectorSize, ElementD, ElementCompute,
ElementSFD, GmemLayoutSFD,
ElementBias, ElementC
>;
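  // Same block scale-factor generation as the tests above, with a per-column
  // bias vector folded into the linear combination first: in outline,
  // D = e2m1_quantize(alpha * acc + beta * C + bias).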
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
PerSmTileShape_MNK, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, AlignC,
      ElementD, GmemLayoutD, AlignD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, AlignA,
ElementB, GmemLayoutB, AlignB,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>();
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,364 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_2sm_f32_ptr_array, 256x128x64_4x1x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
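  // For 16-bit bf16 elements this evaluates to 8 elements, i.e. one 16-byte,
  // TMA-friendly access.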
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_4,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
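  // Worked shapes for this test: TileShape 256x128x64 / ClusterShape 4x1x1
  // gives each CTA a 64x128x64 output tile, while AtomThrShape 2x1x1 makes the
  // MMA tile 128x128x64, i.e. two CTAs along M cooperate on one 2-SM MMA.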
  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch (2-SM, matching the test name and AtomThrShape above)
  using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_bf16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::bfloat16_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::bfloat16_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_bf16t_bf16n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_4x4x1) {
// A matrix configuration
using ElementA = cutlass::bfloat16_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::bfloat16_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,323 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/// A Row B Col
TEST(SM100_Device_Gemm_bf16t_bf16n_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
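  // Worked shapes: 512x512x128 cluster tile / 4x4x1 cluster = 128x128x128
  // output tile per CTA; the 256x128x128 MMA tile spans two CTAs along M (2-SM).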
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100_Device_Gemm_bf16n_bf16t_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100_Device_Gemm_bf16t_bf16t_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100_Device_Gemm_bf16n_bf16n_f32t_tensorop_2sm_f32, 512x512x128_4x4x1) {
using ElementA = cutlass::bfloat16_t;
using ElementB = cutlass::bfloat16_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 16,
ElementD, GmemLayoutC, 16,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, GmemLayoutA, 8,
ElementB, GmemLayoutB, 8,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_bf16t_bf16t_bf32_void_f32n_tensor_op, 128x256x64_1x2x1) {
using ElementA = cutlass::bfloat16_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::bfloat16_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using MmaTileShape = Shape<_128,_128,_64>;
using TileShape_MNK = Shape<_128,_256,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
void, LayoutC, 8,
float, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      ElementA, LayoutA, 8,
      ElementB, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,364 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_1sm_f16_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
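// ----------------------------------------------------------------------------------------------
// Illustrative sketch only (not part of the checked-in tests): the argument layout for a
// ptr-array kernel like the one above, mirroring the CUTLASS 3.x ptr-array examples. The
// d_ptr_* names are hypothetical device arrays holding L per-batch pointers; a single stride
// is shared by all L batches. TestSmall<Gemm> assembles the equivalent arguments internally.
template <class Gemm>
typename Gemm::Arguments make_ptr_array_args(
int M, int N, int K, int L,
cutlass::half_t const** d_ptr_A, cutlass::half_t const** d_ptr_B,
cutlass::half_t const** d_ptr_C, cutlass::half_t** d_ptr_D,
typename Gemm::GemmKernel::StrideA stride_A, typename Gemm::GemmKernel::StrideB stride_B,
typename Gemm::GemmKernel::StrideC stride_C, typename Gemm::GemmKernel::StrideD stride_D,
cutlass::half_t alpha, cutlass::half_t beta) {
return typename Gemm::Arguments{
cutlass::gemm::GemmUniversalMode::kArray,
{{M, N, K, L}}, // ArrayProblemShape: one shape shared by L batches
{d_ptr_A, stride_A, d_ptr_B, stride_B}, // mainloop: per-batch pointer arrays
{{alpha, beta}, d_ptr_C, stride_C, d_ptr_D, stride_D} // epilogue: scalars plus C/D arrays
};
}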
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_1sm_f16_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_1sm_f16_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_2sm_f16_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
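// ----------------------------------------------------------------------------------------------
// Illustrative check only (self-contained; restates the aliases from the 2-SM test above): the
// tile-shape algebra these tests rely on. For a 2-SM MMA the atom spans two CTAs in M, so
// MmaTileShape covers the full 256-row tile while each CTA in the cluster writes a 128-row
// OutputCtaShape slab of the result.
namespace sm100_tile_algebra_example {
using TileShape = cute::Shape<cute::_256, cute::_128, cute::_64>;
using Cluster = cute::Shape<cute::_2, cute::_1, cute::_1>;
using AtomThr = decltype(cute::shape_div(Cluster{}, cute::Shape<cute::_2, cute::_1, cute::_1>{}));
using OutputCta = decltype(cute::shape_div(TileShape{}, Cluster{}));
using MmaTile = decltype(cute::shape_div(TileShape{}, AtomThr{}));
static_assert(cute::size<0>(OutputCta{}) == 128, "each CTA in the cluster owns 128 output rows");
static_assert(cute::size<0>(MmaTile{}) == 256, "the 2-SM MMA tile spans both CTAs in M");
} // namespace sm100_tile_algebra_example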
TEST(SM100_Device_Gemm_f16t_f16n_f16n_f16n_tensor_op_2sm_f16_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = cutlass::half_t; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,606 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Grouped GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
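// ----------------------------------------------------------------------------------------------
// Illustrative sketch only (not part of the checked-in tests): for grouped GEMM the builders
// take pointer-to-layout tags (LayoutA *, LayoutC *, ...) to select per-group strides, and
// GroupProblemShape carries one Shape<int,int,int> per group. The d_*/h_* names below are
// hypothetical device/host arrays of length `groups`; this mirrors the CUTLASS 3.x grouped-GEMM
// examples, and TestSmall<Gemm> assembles the equivalent arguments internally. Note that for
// grouped kernels GemmKernel::StrideA is itself a pointer to per-group strides.
template <class Gemm>
typename Gemm::Arguments make_grouped_args(
int groups,
typename Gemm::GemmKernel::ProblemShape::UnderlyingProblemShape* d_problem_sizes,
typename Gemm::GemmKernel::ProblemShape::UnderlyingProblemShape const* h_problem_sizes,
cutlass::half_t const** d_ptr_A, typename Gemm::GemmKernel::StrideA d_stride_A,
cutlass::half_t const** d_ptr_B, typename Gemm::GemmKernel::StrideB d_stride_B,
cutlass::half_t const** d_ptr_C, typename Gemm::GemmKernel::StrideC d_stride_C,
cutlass::half_t** d_ptr_D, typename Gemm::GemmKernel::StrideD d_stride_D,
float alpha, float beta) {
return typename Gemm::Arguments{
cutlass::gemm::GemmUniversalMode::kGrouped,
{groups, d_problem_sizes, h_problem_sizes}, // per-group M/N/K extents
{d_ptr_A, d_stride_A, d_ptr_B, d_stride_B}, // per-group pointers and strides
{{alpha, beta}, d_ptr_C, d_stride_C, d_ptr_D, d_stride_D}
};
}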
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16n_f16t_f16t_tensor_op_1sm_f32_group, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16n_f16n_f16n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16t_f16t_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16t_f16n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100Only_Device_Gemm_f16t_f16n_f16t_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,665 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
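// ---------------------------------------------------------------------------
// Editor's note: an illustrative compile-time sketch (not part of the original
// test file) of the shape recipe the tests above share. For 1SM kernels the
// shape_div divisor is Shape<_1,_1,_1>, so the MMA tile equals the CTA tile;
// for 2SM kernels it is Shape<_2,_1,_1>, so the MMA atom spans two CTAs along
// M while each CTA still owns TileShape / ClusterShape of the output. Worked
// out for the 256x128x64_2x1x1 configuration above:
namespace shape_recipe_sketch {
using TileShape_MNK    = cute::Shape<cute::_256, cute::_128, cute::_64>;
using ClusterShape_MNK = cute::Shape<cute::_2, cute::_1, cute::_1>;
// 2SM divisor: two CTAs cooperate on one MMA atom along M, leaving a 1x1x1 grid of atoms.
using AtomThrShape     = decltype(cute::shape_div(ClusterShape_MNK{}, cute::Shape<cute::_2, cute::_1, cute::_1>{}));
// Each CTA writes a 128x128x64 slice of the output tile ...
using OutputCtaShape   = decltype(cute::shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
// ... while the MMA instruction itself operates on the full 256x128x64 tile.
using MmaTileShape     = decltype(cute::shape_div(TileShape_MNK{}, AtomThrShape{}));
static_assert(cute::size<0>(OutputCtaShape{}) == 128, "each CTA writes a 128-row slice of the tile");
static_assert(cute::size<0>(MmaTileShape{})   == 256, "the 2SM MMA tile spans both CTAs along M");
} // namespace shape_recipe_sketch
// ---------------------------------------------------------------------------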
TEST(SM100_Device_Gemm_f16t_f16n_f16n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
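// Editor's note: in the ptr-array testbed header included by this file, the two
// scalar arguments to TestSmall<Gemm>(...) are alpha and beta for the epilogue
// D = alpha * (A @ B) + beta * C, so the (1.0, 0.0) cases above exercise the
// no-source path while (2.0, 2.0) exercises a full linear combination.
// (Inferred from the testbed usage; see gemm_testbed_3x_ptr_array.hpp for the
// authoritative signature.)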
TEST(SM100_Device_Gemm_f16n_f16t_f16t_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16n_f16n_f16n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f16t_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f16n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f16t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor;                    // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = cutlass::half_t; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = cutlass::half_t; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
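// ---------------------------------------------------------------------------
// Editor's note: a minimal host-side sketch of how one of the ptr-array Gemm
// adapters above is typically driven. Illustration only -- it assumes
// device-resident arrays of per-batch pointers (ptr_A, ptr_B, ptr_C, ptr_D),
// matching strides, scalars alpha/beta, and a populated hw_info, none of which
// appear in the original file:
//
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kArray,
//     {{M, N, K, num_batches}},                          // ArrayProblemShape
//     {ptr_A, stride_A, ptr_B, stride_B},                // mainloop operands
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D}, // epilogue
//     hw_info
//   };
//   Gemm gemm;
//   if (gemm.can_implement(args) == cutlass::Status::kSuccess) {
//     cutlass::device_memory::allocation<uint8_t> workspace(Gemm::get_workspace_size(args));
//     gemm.initialize(args, workspace.get());
//     gemm.run();
//   }
// ---------------------------------------------------------------------------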

View File

@ -0,0 +1,250 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface with stream-K scheduling
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 128x256x64_1x2x1) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_128,_256,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 256x128x64_2x1x1) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_gmma_f32_stream_k, 256x256x64_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
///////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_gmma_f32_stream_k, 256x256x64_2x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::half_t, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
cutlass::gemm::StreamKScheduler
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using Testbed = Testbed3x<Gemm, cutlass::epilogue::thread::Identity>;
bool result = TestSmall<Gemm, false /*force_legacy_epilogue*/, false /*apply_alignment_offset*/>(1.0, 0.0, CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorScale::ENABLED, {64, 1024, 2048});
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
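// ---------------------------------------------------------------------------
// Editor's note: the stream-K variants above differ from their data-parallel
// counterparts only in the fourth template argument of
// cutlass::gemm::kernel::GemmUniversal, which selects the tile scheduler.
// A minimal sketch of the two choices (types as defined in the tests above):
//
//   using DataParallelKernel = cutlass::gemm::kernel::GemmUniversal<
//       Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue>;  // default persistent scheduler
//
//   using StreamKKernel = cutlass::gemm::kernel::GemmUniversal<
//       Shape<int,int,int,int>, CollectiveMainloop, CollectiveEpilogue,
//       cutlass::gemm::StreamKScheduler>;  // splits the K iterations of output tiles across SMs
//
// Stream-K rebalances work by letting multiple SMs cooperate on the K loop of
// the same output tile, which is why these tests sweep K over {64, 1024, 2048}.
// ---------------------------------------------------------------------------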

View File

@ -0,0 +1,104 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100_Device_Gemm_f16t_f16t_f32_void_f16n_tensor_op, 128x256x64_1x2x1) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::RowMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementAccumulator = float;
using LayoutC = cutlass::layout::ColumnMajor;
using TileShape_MNK = Shape<_128,_256,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
void, LayoutC, 8,
cutlass::half_t, LayoutC, 8,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::half_t, LayoutA, 8,
cutlass::half_t, LayoutB, 8,
float,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
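// ---------------------------------------------------------------------------
// Editor's note: passing `void` as ElementC to the epilogue CollectiveBuilder
// (the `void, LayoutC, 8` argument above) builds a source-less epilogue: no C
// tensor is loaded and the beta term is elided, which is why the test drives
// TestSmall<Gemm>(1.0, 0.0) with beta == 0. A sourceful variant would simply
// substitute a real element type (sketch, using the aliases from the test):
//
//   using EpilogueWithSource = typename cutlass::epilogue::collective::CollectiveBuilder<
//       cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
//       OutputCtaShape, ClusterShape_MNK,
//       cutlass::epilogue::collective::EpilogueTileAuto,
//       float, float,
//       cutlass::half_t, LayoutC, 8,   // ElementC: load C and apply beta
//       cutlass::half_t, LayoutC, 8,
//       cutlass::epilogue::collective::EpilogueScheduleAuto
//   >::CollectiveOp;
// ---------------------------------------------------------------------------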

View File

@ -0,0 +1,664 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
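// Editor's note: with ElementC/ElementD = float in the tests of this file, the
// AlignmentC/AlignmentD expressions above evaluate to 128 / 32 = 4 elements,
// the same 16-byte access granularity that yields 8 elements for the half_t
// operands A and B (128 / 16 = 8).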
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f16t_f16n_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
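// The *2SmSm100 schedules let the two CTAs of the cluster cooperate on a
// single MMA, which is why MmaTileShape above retains the full 256-row tile
// while each CTA's output tile (OutputCtaShape) is half that.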
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16t_f16n_f32t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16n_f16t_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16n_f16t_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16t_f16t_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100_Device_Gemm_f16n_f16n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = cutlass::half_t; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = cutlass::half_t; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
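
// For reference, a host-side launch of one of these ptr-array kernels looks
// roughly like the sketch below (what TestSmall drives internally; ptr_A,
// stride_A, hw_info, workspace, etc. are illustrative names, not defined here):
//
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kArray,
//     {{M, N, K, L}},                                    // one shape shared by all L entries
//     {ptr_A, stride_A, ptr_B, stride_B},                // device arrays of per-batch pointers
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D},
//     hw_info
//   };
//   Gemm gemm;
//   gemm.initialize(args, workspace);
//   gemm.run();
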
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,606 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Grouped GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;

TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
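// With float operands under OpClassTensorOp, the collective builder is
// expected to lower the mainloop to TF32 tensor-core MMAs.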
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
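// Unlike the ptr-array kernels above, grouped GEMM passes layouts as pointer
// types (LayoutC *, etc.) so that each group can carry its own strides.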
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
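// GroupProblemShape holds one rank-3 (M,N,K) problem size per group, whereas
// the ptr-array kernels use a single rank-4 (M,N,K,L) ArrayProblemShape
// shared by every batch entry.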
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32n_f32n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32n_f32t_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32t_f32n_tensor_op_1sm_f32_group, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32n_f32n_f32n_tensor_op_1sm_f32_group, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32n_f32t_f32n_tensor_op_1sm_f32_group, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32t_f32t_f32n_tensor_op_2sm_f32_group, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}

TEST(SM100Only_Device_Gemm_f32n_f32n_f32n_tensor_op_2sm_f32_group, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC *, AlignmentC,
ElementD, LayoutD *, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA *, AlignmentA,
ElementB, LayoutB *, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
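
// A grouped launch differs from the ptr-array sketch mainly in its mode and
// per-group problem sizes; roughly (illustrative names, not defined here):
//
//   typename Gemm::Arguments args{
//     cutlass::gemm::GemmUniversalMode::kGrouped,
//     {num_groups, problem_sizes_device, problem_sizes_host},
//     {ptr_A, stride_A, ptr_B, stride_B},   // per-group pointer and stride arrays
//     {{alpha, beta}, ptr_C, stride_C, ptr_D, stride_D},
//     hw_info
//   };
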
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,667 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Ptr-Array GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
using namespace cute;
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 1.0);
EXPECT_TRUE(result);
}
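For readers tracking the tile algebra in this first test: with a 1x1x1 cluster the shape_div lines are identities, so each CTA both computes and writes out the full 64x128x64 tile. A compile-time sketch of that claim (assumes CuTe is on the include path):

#include "cute/tensor.hpp"

namespace sketch_1sm {
using namespace cute;
using TileShape_MNK    = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape     = decltype(shape_div(TileShape_MNK{}, Shape<_1,_1,_1>{}));

static_assert(size<0>(OutputCtaShape{}) ==  64);  // M per CTA
static_assert(size<1>(OutputCtaShape{}) == 128);  // N per CTA
static_assert(size<0>(MmaTileShape{})   ==  64);  // MMA tile equals the CTA tile
} // namespace sketch_1sm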
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32n_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
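The 2SM variants divide differently: AtomThrShape is the cluster shape divided by Shape<_2,_1,_1>, reflecting that one MMA atom spans a pair of CTAs in M. As a result the MMA tile keeps the full 256-row extent while each CTA's output tile is halved. A compile-time sketch of the 256x128x64 / 2x1x1 configuration above:

#include "cute/tensor.hpp"

namespace sketch_2sm {
using namespace cute;
using TileShape_MNK    = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape     = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));  // -> 1x1x1
using OutputCtaShape   = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape     = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));

static_assert(size<0>(OutputCtaShape{}) == 128);  // each CTA epilogues 128 rows
static_assert(size<0>(MmaTileShape{})   == 256);  // the 2-SM MMA covers all 256 rows
} // namespace sketch_2sm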
TEST(SM100_Device_Gemm_f32t_f32n_f32t_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::RowMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::RowMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32t_f32n_tensor_op_1sm_f32_ptr_array, 64x128x64_1x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_64,_128,_64>;
using ClusterShape_MNK = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32n_f32n_f32n_tensor_op_1sm_f32_ptr_array, 128x128x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_128,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32n_f32t_f32n_tensor_op_1sm_f32_ptr_array, 128x64x64_1x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_128,_64,_64>;
using ClusterShape_MNK = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(3.0, 2.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32t_f32t_f32n_tensor_op_2sm_f32_ptr_array, 256x128x64_2x1x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_128,_64>;
using ClusterShape_MNK = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(result);
}
TEST(SM100_Device_Gemm_f32n_f32n_f32n_tensor_op_2sm_f32_ptr_array, 256x256x64_2x2x1) {
// A matrix configuration
using ElementA = float; // Element type for A matrix operand
using LayoutA = cutlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
// B matrix configuration
using ElementB = float; // Element type for B matrix operand
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
// C matrix configuration
using ElementC = float; // Element type for C matrix operands
using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C matrix operands
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = float; // Element type for D matrix operands
using LayoutD = cutlass::layout::ColumnMajor; // Layout type for D matrix operands
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using TileShape_MNK = Shape<_256,_256,_64>;
using ClusterShape_MNK = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape_MNK{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(TileShape_MNK{}, ClusterShape_MNK{}));
using MmaTileShape = decltype(shape_div(TileShape_MNK{}, AtomThrShape{}));
using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100; // Kernel to launch
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm; // Epilogue to launch
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementAccumulator,
ElementC, LayoutC, AlignmentC,
ElementD, LayoutD, AlignmentD,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
ElementA, LayoutA, AlignmentA,
ElementB, LayoutB, AlignmentB,
ElementAccumulator,
MmaTileShape, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool result = TestSmall<Gemm>(2.0, 2.0);
EXPECT_TRUE(result);
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,327 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 512x256x256_4x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.0);
EXPECT_TRUE(pass);
}
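The mainloop alignment of 32 in the builder above is the 4-bit analogue of the 128 / sizeof_bits pattern used in the f32 tests: e2m1 elements are 4 bits wide, so 32 of them fill one 128-bit (16-byte) access. A sketch of the arithmetic:

#include "cutlass/numeric_types.h"

static_assert(cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 4,
              "e2m1 is a 4-bit floating-point type");
static_assert(32 * cutlass::sizeof_bits<cutlass::float_e2m1_t>::value == 128,
              "a 32-element alignment is one 128-bit access");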
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x384x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_group, 256x512x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 256x256x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
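On the MmaTypePair tuples used throughout these block-scaled tests: each pairs the 4-bit e2m1 data type with a ue8m0 scale-factor type. Assuming the OCP MX convention of one scale per 32-element block along K (stated here as background, not read from this diff), a 256-deep K tile carries eight scales per row of A and per column of B:

// Sketch: scale-factor count per K tile for mxf4, assuming a 32-element
// MX block size per ue8m0 scale.
constexpr int kScaleBlockK    = 32;   // elements sharing one scale (assumption)
constexpr int kTileK          = 256;  // K extent of the cluster tiles above
static_assert(kTileK % kScaleBlockK == 0);
constexpr int kScalesPerKTile = kTileK / kScaleBlockK;
static_assert(kScalesPerKTile == 8);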
TEST(SM100Only_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_group, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 4,
ElementD, LayoutC *, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA *, 32,
MmaTypePairB, LayoutB *, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,327 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 512x256x256_4x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_256,_256>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
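This file mirrors the grouped block-scaled tests above but swaps the problem-shape wrapper: GroupProblemShape<Shape<int,int,int>> carries an independent M,N,K per group, while ArrayProblemShape<Shape<int,int,int,int>> carries a single M,N,K,L shared by all L batches. Side by side (a sketch; the includes at the top of this test file are sufficient):

// Grouped GEMM: every group may have its own M, N, K (and its own strides,
// hence the LayoutA * / LayoutB * builder tags in the grouped file).
using GroupShape = cutlass::gemm::GroupProblemShape<cute::Shape<int,int,int>>;
// Ptr-array GEMM: one M, N, K shared by all L batches, plain layout tags.
using ArrayShape = cutlass::gemm::ArrayProblemShape<cute::Shape<int,int,int,int>>;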
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x384x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_384,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e2m1_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 32,
MmaTypePairB, LayoutB, 32,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,156 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
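// Runtime-datatype tests: the mainloop is instantiated on
// type_erased_dynamic_float4_t, and the concrete 4-bit encoding (E2M1 for both
// A and B below) is selected at run time via cute::UMMA::MXF8F6F4Format.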
TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32t_tensorop_2sm_f32_runtime_datatype, 512x512x128_4x4x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
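// StageCountAutoCarveout reserves the epilogue's SharedStorage out of the
// shared-memory budget and gives the remainder to the mainloop pipeline stages.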
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E2M1, cute::UMMA::MXF8F6F4Format::E2M1);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m1t_e2m1n_f32t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E2M1, cute::UMMA::MXF8F6F4Format::E2M1);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,486 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
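// Ptr-array block-scaled GEMMs on the Mxf8f6f4 schedule: e2m3 (FP6) operands
// paired with ue8m0 scale factors, using the 128-element operand alignment
// used by this schedule.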
TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 128x128x256_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_128,_128,_256>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 256x512x256_2x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_512,_256>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_1sm_f32_ptr_array, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_1sm_f32_ptr_array, 512x1024x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 256x256x256_2x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_256,_256,_256>;
using ClusterShape = Shape<_2,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x512x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_512,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32n_tensorop_2sm_f32_ptr_array, 512x768x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_768,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e2m3t_e2m3n_f32t_tensorop_2sm_f32_ptr_array, 512x1024x256_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e2m3_t;
using ElementB = cutlass::float_e2m3_t;
using ElementC = float;
using ElementD = float;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = cute::tuple<ElementA, ElementSF>;
using MmaTypePairB = cute::tuple<ElementB, ElementSF>;
using ClusterTileShape = cute::Shape<_512,_1024,_256>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 4,
ElementD, LayoutC, 4,
EpilogueSchedule
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, LayoutA, 128,
MmaTypePairB, LayoutB, 128,
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm, true>(1.0, 0.5);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,156 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
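// A and B can decode to different formats at run time: both operands are
// type_erased_dynamic_float6_t here, with A resolved to E3M2 and B to E2M3.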
TEST(SM100_Device_Gemm_e3m2t_e2m3n_f32t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E3M2, cute::UMMA::MXF8F6F4Format::E2M3);
EXPECT_TRUE(pass);
}

TEST(SM100_Device_Gemm_e3m2t_e2m3n_f32t_tensorop_1sm_f32_runtime_datatype, 512x512x128_4x4x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::RowMajor, 128,
cutlass::type_erased_dynamic_float6_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_4,cute::_4,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E3M2, cute::UMMA::MXF8F6F4Format::E2M3);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,109 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
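// Mixed-width runtime datatypes: A is type-erased FP8 with a 16-element
// alignment, B is type-erased FP4 with a 128-element alignment, resolved at
// run time to E4M3 and E2M1 respectively.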
TEST(SM100_Device_Gemm_e4m3t_e2m1n_f32t_tensorop_2sm_f32_runtime_datatype, 256x128x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
float, cutlass::layout::RowMajor, 4,
float, cutlass::layout::RowMajor, 4,
cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::epilogue::fusion::LinearCombination<
float,
float,
float,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float4_t, cutlass::layout::ColumnMajor, 128,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E2M1);
EXPECT_TRUE(pass);
}

#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,504 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide Grouped GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
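// Grouped GEMM tests: GroupProblemShape carries one (M,N,K) per group, and the
// pointer-to-layout arguments (e.g. LayoutA *) let every group supply its own
// strides.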
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
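// 16 / sizeof(Element) expresses a 16-byte alignment in element units
// (16 elements for the 1-byte e4m3 types here).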
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_group, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_group, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_group, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1_silu) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC *, 16 / sizeof(ElementC),
ElementD, LayoutC *, 16 / sizeof(ElementD),
EpilogueSchedule,
cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::SiLu, ElementD, ElementAccumulator>
>::CollectiveOp;
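// LinCombEltAct fuses an elementwise activation into the epilogue:
// D = SiLu(alpha * acc + beta * C).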
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(2.0, 0.5);
EXPECT_TRUE(pass);
}

TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1_voidC_silu) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutD = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
void, LayoutD *, 16 / sizeof(ElementD),
ElementD, LayoutD *, 16 / sizeof(ElementD),
EpilogueSchedule,
cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::SiLu, ElementD, ElementAccumulator>
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA *, 16 / sizeof(ElementA),
ElementB, LayoutB *, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(2.0, 0.0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,465 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////// 128x128x128 //////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
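// Shape vocabulary used throughout this file (editorial summary):
//   OutputCtaShape = ClusterTileShape / ClusterShape -- per-CTA tile handed to
//                    the epilogue builder;
//   AtomThrShape   = ClusterShape / <CTAs per MMA atom> (divisor Shape<_1,_1,_1>
//                    in the 1SM tests, Shape<_2,_1,_1> in the 2SM tests) -- how
//                    many MMA atoms tile the cluster;
//   MmaTileShape   = ClusterTileShape / AtomThrShape -- output tile computed by
//                    one MMA atom, fed to the mainloop builder.
// For this first test all three collapse to 128x128x128 since the cluster is 1x1x1.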
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
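// The _2sm tests below pair the PtrArrayTmaWarpSpecialized2Sm epilogue with the
// KernelPtrArrayTmaWarpSpecialized2SmSm100 mainloop: one MMA instruction spans
// two CTAs in M, so for the next test MmaTileShape works out to 256x128x128
// while each CTA still writes a 128x128 output tile.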
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
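// Worked shape arithmetic for the 4x4 cluster case below (a sketch):
//   AtomThrShape   = (4,4,1) / (2,1,1)       = (2,4,1)  -- eight 2SM MMA atoms
//   MmaTileShape   = (512,512,128) / (2,4,1) = (256,128,128)
//   OutputCtaShape = (512,512,128) / (4,4,1) = (128,128,128)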
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_2sm_f32_ptr_array, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3n_e4m3t_e4m3n_tensorop_1sm_f32_ptr_array, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3n_e4m3n_e4m3n_tensorop_1sm_f32_ptr_array, 64x128x128_1x2x1) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_64,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3t_e4m3n_tensorop_2sm_f32_ptr_array, 256x128x128_2x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_256,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_ptr_array, 512x512x128_4x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_512,_512,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0.5);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,297 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
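// Editorial note: these tests exercise runtime data types. The mainloop is
// built with cutlass::type_erased_dynamic_float8_t for A and B, so one compiled
// kernel accepts any 8-bit float encoding; the concrete formats are selected at
// run time by the cute::UMMA::MXF8F6F4Format values (E4M3 / E5M2) handed to
// TestRuntimeDataTypeSmall. Both encodings are one byte wide, which is what
// keeps the type erasure compatible with the TMA and shared-memory layouts.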
TEST(SM100_Device_Gemm_e5m2t_e4m3n_e4m3t_tensorop_2sm_f32_runtime_datatype, 256x128x128_2x1x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_1,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E4M3);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e5m2t_e4m3n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E4M3);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e5m2n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E5M2);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::float_e4m3_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e4m3_t,
float,
cutlass::float_e4m3_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized1SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E4M3, cute::UMMA::MXF8F6F4Format::E4M3);
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e5m2t_e5m2n_e5m2t_tensorop_2sm_f32_runtime_datatype, 256x256x128_2x2x1) {
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cute::Shape<cute::_128, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::epilogue::collective::EpilogueTileAuto,
float, float,
cutlass::float_e5m2_t, cutlass::layout::RowMajor, 16,
cutlass::float_e5m2_t, cutlass::layout::RowMajor, 16,
cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::epilogue::fusion::LinearCombination<
cutlass::float_e5m2_t,
float,
cutlass::float_e5m2_t,
float
>
>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::RowMajor, 16,
cutlass::type_erased_dynamic_float8_t, cutlass::layout::ColumnMajor, 16,
float,
cute::Shape<cute::_256, cute::_128, cute::_128>,
cute::Shape<cute::_2,cute::_2,cute::_1>,
cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename CollectiveEpilogue::SharedStorage)>,
cutlass::gemm::KernelTmaWarpSpecialized2SmSm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue,
void>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestRuntimeDataTypeSmall<Gemm>(cute::UMMA::MXF8F6F4Format::E5M2, cute::UMMA::MXF8F6F4Format::E5M2);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,230 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////// Test Batch alpha and beta //////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
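// Editorial note: TestSmallFusion is invoked below with three boolean template
// flags; judging from the trailing comments on each call, the last flag
// requests per-batch beta values, expanding the scalar beta argument into one
// value per batch (e.g. beta = 1.0 becomes [1.0, 2.0]). The precise flag
// semantics live in the testbed header and are assumed here.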
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_batch_alpha_beta, 128x64x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
ElementD,
ElementCompute,
ElementC,
ElementBias
>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule,
FusionOperation
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm, false, true, true>(1.0, 1.0); // beta is [1.0, 2.0]
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_bias_relu_batch_alpha_beta, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltAct<
cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule,
FusionOperation
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm, false, false, true>(1.0, 0.5); // beta is [0.5, 1.5]
EXPECT_TRUE(pass);
}
TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3n_tensorop_1sm_f32_bias_relu_batch_alpha_beta0, 128x128x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using ClusterTileShape = cute::Shape<_128,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
using FusionOperation = cutlass::epilogue::fusion::ScaledLinCombPerRowBiasEltAct<
cutlass::epilogue::thread::ReLU, ElementD, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule,
FusionOperation
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm, false, false, true>(1.0, -1.0); // beta is [-1.0, 0.0]
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,284 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 128x64x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
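// Editorial note: these int8 tests pass CheckEquality::EXACT because s8 x s8
// products accumulated into int32 incur no rounding, so device and reference
// outputs can be compared bit-for-bit rather than within a tolerance.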
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 128x64x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementD = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = cute::Shape<_128,_64,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 128x64x128 Cluster4x2x1 TMEM 4x1 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_1cta_s32_ptr_array, 512x128x128_4x2x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementD = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_512,_128,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_4,_2,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 64x256x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_s8t_s8n_s32n_tensorop_1cta_s32_ptr_array, 64x256x128_1x1x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int32_t;
using ElementD = int32_t;
using ElementAccumulator = int32_t;
using ElementCompute = int32_t;
using ElementBias = int32_t;
using ClusterTileShape = cute::Shape<_64,_256,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_1,_1,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_1,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 64x256x128 Cluster2x4x1 TMEM 2x2 ////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4x1) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::ColumnMajor;
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementD = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
using ElementBias = int8_t;
using ClusterTileShape = Shape<_128,_1024,Int<128 / sizeof(ElementA)>>;
using ClusterShape = Shape<_2,_4,_1>;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<_2,_1,_1>{}));
using OutputCtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{}));
using MmaTileShape = decltype(shape_div(ClusterTileShape{}, AtomThrShape{}));
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, LayoutC, 16 / sizeof(ElementC),
ElementD, LayoutC, 16 / sizeof(ElementD),
EpilogueSchedule
>::CollectiveOp;
using MainloopSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
ElementA, LayoutA, 16 / sizeof(ElementA),
ElementB, LayoutB, 16 / sizeof(ElementB),
ElementAccumulator,
MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopSchedule
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::ArrayProblemShape<Shape<int,int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using namespace test::gemm::device;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = TestSmall<Gemm>(2, 0.5, CheckEquality::EXACT);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,293 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
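// Editorial note on the block-scaled tests in this file: each operand is
// described as a (data, scale-factor) pair -- e.g. MmaTypePairA pairs
// float_e2m1_t values with float_ue8m0_t scale factors -- and is dispatched
// through OpClassBlockScaledTensorOp with the
// KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100 schedule. The A operand is
// 4-bit (e2m1), hence the 128-element alignment used for it versus 16 elements
// for the 8-bit B operand.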
/// A Row B Col
TEST(SM100Only_Device_Gemm_e2m1t_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
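// A is 4-bit e2m1, so its 128-element alignment works out to 64 bytes; 8-bit B needs only 16 elements.
// StageCountAutoCarveout first reserves the epilogue's shared memory, then sizes the mainloop
// stage count from whatever remains.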
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100Only_Device_Gemm_e2m1n_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100Only_Device_Gemm_e2m1t_e4m3t_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100Only_Device_Gemm_e2m1n_e4m3n_e4m3t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e2m1_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = cutlass::float_e4m3_t;
using ElementD = cutlass::float_e4m3_t;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 128,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
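// Unlike the three tests above, C is non-void here, so beta = 2.0 exercises the source-accumulation path.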
auto pass = test::gemm::device::TestAll<Gemm>(1.0, 2.0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,281 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/// A Row B Col
TEST(SM100_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
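// mx_float8_t bundles the e4m3 data type with a ue8m0 per-block scale factor, i.e. the OCP MXFP8 format.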
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
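// With the Auto schedules the builders pick the kernel and epilogue variants themselves; the
// 256-row MMA tile spans two CTAs, giving the 2SM MMA named in the test.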
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_auto, 512x512x128_4x4x1) {
using ElementA = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementB = cutlass::mx_float8_t<cutlass::float_e4m3_t>;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC, 4,
ElementD, GmemLayoutC, 4,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
ElementA, GmemLayoutA, 16,
ElementB, GmemLayoutB, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmallFusion<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,293 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/epilogue/thread/activation.h"
#include "../../common/cutlass_unit_test.h"
#include "gemm_testbed_3x_ptr_array.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
/// A Row B Col
TEST(SM100Only_Device_Gemm_e4m3t_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
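// GroupProblemShape makes this a grouped GEMM: each group carries its own (M,N,K), which is
// why the builders above take pointer-to-layout (GmemLayout *) arguments.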
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Row
TEST(SM100Only_Device_Gemm_e4m3n_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Row B Row
TEST(SM100Only_Device_Gemm_e4m3t_e4m3t_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::RowMajor;
using GmemLayoutB = cutlass::layout::RowMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
/// A Col B Col
TEST(SM100Only_Device_Gemm_e4m3n_e4m3n_f32t_tensorop_2sm_f32_group, 512x512x128_4x4x1) {
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = void;
using ElementD = float;
using ElementCompute = float;
using ElementAccumulator = float;
using ElementSF = cutlass::float_ue8m0_t;
using MmaTypePairA = decltype(cute::make_tuple(ElementA{}, ElementSF{}));
using MmaTypePairB = decltype(cute::make_tuple(ElementB{}, ElementSF{}));
using GmemLayoutA = cutlass::layout::ColumnMajor;
using GmemLayoutB = cutlass::layout::ColumnMajor;
using GmemLayoutC = cutlass::layout::RowMajor;
using ClusterTileShape_MNK = Shape<_512,_512,_128>;
using ClusterShape_MNK = Shape<_4,_4,_1>;
using MmaTileShape_MNK = Shape<_256,_128,_128>;
using OutputCtaShape = decltype(shape_div(ClusterTileShape_MNK{}, ClusterShape_MNK{}));
//
// Construct CollectiveEpilogue
//
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
OutputCtaShape, ClusterShape_MNK,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator, ElementCompute,
ElementC, GmemLayoutC *, 16,
ElementD, GmemLayoutC *, 16,
cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm
>::CollectiveOp;
//
// Construct CollectiveMainloop
//
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassBlockScaledTensorOp,
MmaTypePairA, GmemLayoutA *, 16,
MmaTypePairB, GmemLayoutB *, 16,
ElementAccumulator,
MmaTileShape_MNK, ClusterShape_MNK,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100
>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cutlass::gemm::GroupProblemShape<Shape<int,int,int>>,
CollectiveMainloop,
CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
auto pass = test::gemm::device::TestSmall<Gemm>(1.0, 0);
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -87,6 +87,7 @@ TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align1, 6
/////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
using ElementOutput = float;
@ -124,6 +125,8 @@ TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 1
EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -2974,6 +2974,7 @@ TEST(SM80_gemm_threadblock_crosswise,
}
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3006,8 +3007,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x1024_32x32x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3040,8 +3044,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x64x1024_64x32x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3074,8 +3081,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x1024x1024_32x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3108,8 +3118,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x1024x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3142,8 +3155,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_256x256x6144_128x1024x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3176,8 +3192,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_512x256x6144_256x1024x1024_64x64x1024_16x8x256_3stage) {
using ElementA = cutlass::uint1b_t;
@ -3210,8 +3229,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3244,8 +3266,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x64x512_32x32x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3278,8 +3303,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x64x512_64x32x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3312,8 +3340,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_64x128x512_32x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3346,8 +3377,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
tensor_op_128x128x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3380,8 +3414,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_256x256x6144_128x128x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3414,8 +3451,11 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if 0
TEST(SM80_gemm_threadblock_crosswise,
multicta_512x256x6144_256x128x512_64x64x512_16x8x256_4stage) {
using ElementA = cutlass::uint1b_t;
@ -3448,6 +3488,8 @@ TEST(SM80_gemm_threadblock_crosswise,
problem_size.k(), alpha, beta)
.run(grid, block);
}
#endif
////////////////////////////////////////////////////////////////////////////////
TEST(SM80_gemm_threadblock_congruous,
tensor_op_64x64x16_32x64x16_8x8x4_3stage) {

View File

@ -31,6 +31,7 @@ cutlass_test_unit_add_executable(
pipeline_tma_async.cu
pipeline_tma_async_warp_specialized.cu
pipeline_tma_async_warp_specialized_persistent.cu
pipeline_cluster_launch_control_async_warp_specialized_blackwell.cu
pipeline_async.cu
sequence_barrier.cu
)

View File

@ -0,0 +1,381 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit test for the PipelineCLCFetchAsync class
*/
//
//
#define KERNEL_DBG_TRACE false
#include <cuda/atomic>
#include "../common/cutlass_unit_test.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <cute/tensor.hpp>
#include <cute/arch/cluster_sm90.hpp>
#include <cutlass/util/reference/host/gemm.h>
#include <cutlass/cluster_launch.hpp>
#include "cutlass/core_io.h"
#include "cutlass/util/print_error.hpp"
#include "cutlass/util/GPU_Clock.hpp"
#include "testbed_cluster_launch_control.h"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/arch/barrier.h"
#include "cute/arch/cluster_sm90.hpp"
#include "cutlass/arch/barrier.h"
#include "cutlass/arch/reg_reconfig.h"
#include "cutlass/gemm/kernel/sm100_tile_scheduler.hpp"
using namespace cute;
using namespace cutlass;
using namespace cutlass::gemm::kernel::detail;
//////////////////// Shared Memory /////////////////////////
template <uint32_t Stages, typename ClusterShape>
struct SharedStorage
{
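// Shared memory holds the ring of CLC responses plus the scheduling pipeline's barrier storage.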
alignas(16) typename PersistentTileSchedulerSm100<ClusterShape, Stages>::CLCResponse clc_response[Stages];
alignas(8) typename PersistentTileSchedulerSm100<ClusterShape, Stages>::PipelineStorage storage;
};
//////////////////// Kernel /////////////////////////
template <typename ClusterShape, uint32_t Stages>
__launch_bounds__(256, 1)
__global__ static
void pipeline_device(int *d_workerCount)
{
extern __shared__ char shared_memory[];
// single producer, multiple consumers
// producer: WG0
// consumer: WG1
using SharedStorage = SharedStorage<Stages, ClusterShape>;
using Scheduler = PersistentTileSchedulerSm100<ClusterShape, Stages>;
using TileSchedulingPipeline = typename Scheduler::Pipeline;
SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);
// Logistics
int warp_idx = canonical_warp_idx();
auto cluster_shape = ClusterShape{};
typename TileSchedulingPipeline::Params params;
params.transaction_bytes = 16;
constexpr int NUM_PRODUCER = 32;
constexpr int NUM_CONSUMERS_PER_CTA = 32;
params.consumer_arv_count = NUM_PRODUCER + NUM_CONSUMERS_PER_CTA * cute::size<0>(cluster_shape) * cute::size<1>(cluster_shape);
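// Arrival count: the 32-thread producer warp plus one 32-thread consumer warp in each CTA of the cluster.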
params.producer_arv_count = 1;
// Only the first CTA in the Cluster is producing.
params.producer_blockid = 0;
dim3 block_id_in_cluster = cute::block_id_in_cluster();
// mbarrier.init
TileSchedulingPipeline scheduler_pipeline(shared_storage.storage, params);
Scheduler scheduler(&shared_storage.clc_response[0], typename Scheduler::Params{}, block_id_in_cluster);
// Ensure All CTAs in Cluster have completed init before issuing commits
cute::cluster_arrive_relaxed();
cute::cluster_wait();
uint32_t is_first_block_in_cluster = block_id_in_cluster.x == 0 && block_id_in_cluster.y == 0;
int lane_predicate = cute::elect_one_sync();
uint32_t is_producer = (is_first_block_in_cluster && warp_idx == 0);
uint32_t is_consumer = (warp_idx == 4);
PipelineState<Stages> scheduler_pipe_state;
PipelineState<Stages> scheduler_pipe_state_write = cutlass::make_producer_start_state<TileSchedulingPipeline>();
typename Scheduler::WorkTileInfo work_tile_info = {
static_cast<int32_t>(blockIdx.x),
static_cast<int32_t>(blockIdx.y),
static_cast<int32_t>(blockIdx.z),
false
};
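// The first work tile is simply this CTA's launch coordinates; every later tile comes from a CLC query.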
// Persistent loop
do {
// Producer
if (is_producer) {
// Only 1 thread of the entire cluster issues the query.
scheduler_pipe_state_write = scheduler.advance_to_next_work(scheduler_pipeline, scheduler_pipe_state_write);
}
// Consumers
if (is_consumer) {
int linearCLC = work_tile_info.N_idx * gridDim.x + work_tile_info.M_idx;
// Atomically increment the worker count for the linearCLC by 1.
if (lane_predicate) {
atomicAdd(&d_workerCount[linearCLC], 1);
}
}
// Union of all consumers. Note that the producer here is its own consumer.
if (is_producer || is_consumer) {
scheduler_pipeline.consumer_wait(scheduler_pipe_state);
work_tile_info = scheduler.get_current_work(scheduler_pipe_state);
scheduler_pipeline.consumer_release(scheduler_pipe_state);
++scheduler_pipe_state;
// Add block offset since the scheduler works at cluster level.
dim3 block_id_in_cluster = cute::block_id_in_cluster();
work_tile_info.M_idx += block_id_in_cluster.x;
work_tile_info.N_idx += block_id_in_cluster.y;
work_tile_info.L_idx += block_id_in_cluster.z;
}
} while (work_tile_info.is_valid_tile);
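// The loop exits once a CLC query reports an invalid tile, i.e. the grid has run out of work.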
// End of kernel
cute::cluster_sync();
}
/////////////////////////////////////////////////////
template<uint32_t Stages_, typename ClusterShape_>
struct PipelineTest {
//
// Data members
//
static constexpr uint32_t Stages = Stages_;
static constexpr uint32_t BlockSize = 128 * 2;
using ClusterShape = ClusterShape_;
//
// Methods
//
bool check_results(int *h_workerCount, int size) {
for (int i = 0; i < size; i++) {
if (h_workerCount[i] != 1) {
std::cout << "linearCLC " << i << " has worker count " << h_workerCount[i] << "\n";
return false;
}
}
return true;
}
// Run the pipeline test kernel
cudaError_t run(bool &success, dim3 grid_dim, cudaStream_t stream = 0) {
//
// Configure and launch
//
cudaError_t result;
int smem_size = 192 * 1024; // 192 KB, to force one CTA per SM
auto cluster_shape = Shape<Int<ClusterShape::kM>, Int<ClusterShape::kN>, _1>{};
// Launch a single Cluster, with BlockSize threads per CTA
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), 1);
dim3 dimGrid = grid_dim;
dim3 dimBlock(BlockSize,1,1);
result = cudaFuncSetAttribute(
pipeline_device<
decltype(cluster_shape),
Stages>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size
);
if (result != cudaSuccess) {
std::cerr << "Error: Failed to set Shared Memory size." << std::endl;
return result;
}
int array_size = dimGrid.x * dimGrid.y;
int *d_workerCount, *h_workerCount;
/* Allocate memory. workerCount[i] counts the number of workers that claim
linearCLC i. The expectation is that workerCount[i] == 1 for all i.
*/
h_workerCount = (int*)malloc(array_size * sizeof(int));
result = cudaMalloc(&d_workerCount, array_size * sizeof(int));
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaMalloc." << result << "\n";
return result;
}
for (int i = 0; i < array_size; i++) {
h_workerCount[i] = 0; // Initialize workerCount[i] to 0 for all i.
}
result = cudaMemcpy(d_workerCount, h_workerCount, array_size * sizeof(int), cudaMemcpyHostToDevice);
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaMemcpy." << result << "\n";
return result;
}
// Extended launch API
const void* kernel = (const void*)pipeline_device<decltype(cluster_shape), Stages>;
void* kernel_params[] = {&d_workerCount};
cutlass::ClusterLauncher::launch(dimGrid, dimCluster, dimBlock, smem_size, stream, kernel, kernel_params);
result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Error: cudaDeviceSynchronize() failed" << std::endl;
return result;
}
result = cudaMemcpy(h_workerCount, d_workerCount, array_size * sizeof(int), cudaMemcpyDeviceToHost);
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaMemcpy." << result << "\n";
return result;
}
success = check_results(h_workerCount, array_size);
free(h_workerCount);
result = cudaFree(d_workerCount);
if (result != cudaSuccess) {
std::cerr << "Failed to do cudaFree." << result << "\n";
return result;
}
return cudaSuccess;
}
};
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// Cluster1x2 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x2_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<1, 2, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster2x1 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x1_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<2, 1, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster2x2 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x2_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<2, 2, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster1x1 Stage3
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x1_Stage3) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<1, 1, 1>;
static constexpr uint32_t Stages = 3;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster1x4 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster1x4_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<1, 4, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster4x1 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x1_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<4, 1, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster2x4 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster2x4_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<2, 4, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster4x2 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x2_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<4, 2, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
// Cluster4x4 Stage4
TEST(SM100_Verify_PipelineClusterLaunchControlAsync_WS, Cluster4x4_Stage4) {
Options options;
options.grid_dim = {32,32,1};
using ClusterShape = cutlass::gemm::GemmShape<4, 4, 1>;
static constexpr uint32_t Stages = 4;
using Test = PipelineTest<Stages, ClusterShape>;
Testbed<Test> testbed(options);
EXPECT_TRUE(testbed.verification());
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,154 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Testbed file used by cluster launch control pipeline unit test
*/
//
//
#if CUDA_12_0_SM90_FEATURES_SUPPORTED
#define CUTLASS_UNIT_TEST_PIPELINE true
#else
#define CUTLASS_UNIT_TEST_PIPELINE false
#endif
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <cutlass/gemm/gemm.h>
#include "cutlass/util/command_line.h"
// Command line test options
struct Options {
//
// Data Members
//
bool help = false;
bool verification_enabled = true;
int SM_count = 116;
int clock_MHz = 1477;
dim3 grid_dim = {0,0,0};
//
// Methods
//
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("verification-enabled", verification_enabled, verification_enabled);
cmd.get_cmd_line_argument("sm-count", SM_count, SM_count);
cmd.get_cmd_line_argument("clock", clock_MHz, clock_MHz);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --verification-enabled=<bool> Enable/Disable verification\n"
<< " --sm-count=<int> Number of SMs on the chip\n"
<< " --clock=<int> Locked clock value in Mhz\n";
return out;
}
};
//
// Testbed
//
template<typename Pipeline>
class Testbed {
private:
// Commandline options
Options options;
bool run_test() {
// Run the pipeline test kernel
Pipeline pipeline;
bool success = false;
cudaError_t result = pipeline.run(success, this->options.grid_dim);
CUTE_CHECK_LAST();
return success;
}
public:
Testbed(Options const &options_) : options(options_) {
int device_id = 0;
cudaDeviceProp device_prop;
CUTE_CHECK_ERROR(cudaSetDevice(device_id));
CUTE_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
if (device_prop.major < 1) {
fprintf(stderr, "Device does not support CUDA.\n");
exit(1);
}
}
/// Run verification Gemm problem sizes
bool verification() {
#if !defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
printf(
"CUTLASS_ARCH_MMA_SM100_SUPPORTED must be set, but it is not. \n"
"This test is waived.\n"
);
return true;
#endif
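// Repeat the launch; running 10 iterations makes intermittent scheduling failures more likely to surface.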
#if 1
bool is_success = false;
for (int i = 0; i < 10; i++) {
printf("iteration = %d\n", i);
is_success = run_test();
if (not is_success) {
return is_success;
}
}
return is_success;
#else
// Run the test with single launch
return run_test();
#endif
}
};