Updates for 3.4 release. (#1305)

This commit is contained in:
ANIKET SHIVAM
2024-01-16 10:42:51 -08:00
committed by GitHub
parent acba5beee5
commit 2f589ffa76
166 changed files with 5996 additions and 4702 deletions

View File

@ -38,9 +38,10 @@
#include <vector>
#include <numeric>
#include <cute/tensor.hpp>
#include <cute/container/bit_field.hpp>
#include <cute/algorithm/tuple_algorithms.hpp>
using namespace cute;
TEST(CuTe_core, Bitfield)

View File

@ -43,26 +43,30 @@ test_complement(Layout const& layout, CoSizeHi const& cosize_hi)
auto result = complement(layout, cosize_hi);
CUTLASS_TRACE_HOST("complement( " << layout << ", " << cosize_hi << ") => " << result);
CUTLASS_TRACE_HOST("complement(" << layout << ", " << cosize_hi << ") => " << result);
// Post-condition on the domain size of the complement (1)
EXPECT_GE( size(result), cosize_hi / size(filter(layout)));
// Post-condition on the codomain size of the complement (2)
EXPECT_LE(cosize(result), cute::ceil_div(cosize_hi, cosize(layout)) * cosize(layout));
auto completed = make_layout(layout, result);
// Lower-bound on the codomain size of the layout ++ complement (1)
EXPECT_GE(cosize(completed), cosize_hi);
// Upper-bound on the codomain size of the complement (2)
EXPECT_LE(cosize(result), cute::round_up(cosize_hi, cosize(layout)));
// Post-condition on the codomain of the complement
for (int i = 1; i < size(result); ++i) {
EXPECT_LT(result(i-1), result(i)); // Ordered (3)
for (int j = 0; j < size(layout); ++j) {
EXPECT_NE(result(i), layout(j)); // Complemented (4)
EXPECT_NE(result(i), layout(j)); // Disjoint (4)
}
}
// Other observations
EXPECT_LE(size(result),cosize(result)); // As a result of the ordered condition (3)
EXPECT_GE(cosize(result), cosize_hi / size(filter(layout))); // As a result of (1) (2) and (5)
if constexpr (is_static<decltype(stride(make_layout(layout,result)))>::value) { // If we can apply complement again
EXPECT_EQ(size(complement(make_layout(layout,result))), 1); // There's no more codomain left over
EXPECT_LE(size(result), cosize(result)); // As a result of the ordered condition (3)
EXPECT_GE(size(result), cosize_hi / size(filter(layout)));
EXPECT_LE(cosize(completed), cosize(result) + cosize(layout));
EXPECT_GE(cosize(result), cosize_hi / size(filter(layout)));
if constexpr (is_static<decltype(stride(completed))>::value) { // If we can apply complement again
EXPECT_EQ(size(complement(completed)), 1); // There's no more codomain left over
}
}
@ -125,6 +129,7 @@ TEST(CuTe_core, Complement)
test_complement(layout, Int<1>{});
test_complement(layout);
test_complement(layout, Int<16>{});
test_complement(layout, Int<19>{});
}
{
@ -153,6 +158,12 @@ TEST(CuTe_core, Complement)
test_complement(layout);
}
{
auto layout = Layout<Shape<_2,_4>, Stride<_1,_6>>{};
test_complement(layout);
}
{
auto layout = Layout<Shape<_2,_4,_8>, Stride<_8,_1,_64>>{};
@ -167,26 +178,34 @@ TEST(CuTe_core, Complement)
}
{
auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
auto layout = make_layout(Shape <Shape <_2,_2>,Shape <_2, _2>>{},
Stride<Stride<_1,_4>,Stride<_8,_32>>{});
test_complement(layout);
}
{
auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
auto layout = make_layout(Shape <Shape <_2, _2>,Shape <_2,_2>>{},
Stride<Stride<_1,_32>,Stride<_8,_4>>{});
test_complement(layout);
}
// Fails due to non-injective input
//{
//auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
// Fails due to non-injective layout
// {
// auto layout = make_layout(Shape<Shape<_2,_2>,Shape<_2, _2>>{},
// Stride<Stride<_1,_8>,Stride<_8,_4>>{});
//test_complement(layout);
//}
// test_complement(layout);
// }
// Fails due to non-injective layout
// {
// auto layout = Layout<Shape<_2,_2>, Stride<_2,_3>>{};
// test_complement(layout);
// test_complement(layout, Int<19>{});
// }
{
auto layout = Layout<Shape<_4,_6>, Stride<_1,_6>>{};

View File

@ -42,8 +42,8 @@ using namespace cute;
template <class LayoutA, class LayoutB>
void
test_composition(const LayoutA& layoutA,
const LayoutB& layoutB)
test_composition(LayoutA const& layoutA,
LayoutB const& layoutB)
{
auto layoutR = composition(layoutA, layoutB);
@ -52,14 +52,12 @@ test_composition(const LayoutA& layoutA,
CUTLASS_TRACE_HOST(" => ");
CUTLASS_TRACE_HOST(layoutR);
// Test that layout R is compatible with layout B
// Test that layout B is compatible with layout R
EXPECT_TRUE(compatible(layoutB, layoutR));
// True post-condition: Every coordinate c of layoutB with L1D(c) < size(layoutR) is a coordinate of layoutR.
// Test that R(c) = A(B(c)) for all coordinates c in layoutR
for (int i = 0; i < size(layoutR); ++i) {
EXPECT_EQ(layoutR(i), layoutA(layoutB(i)));
// Test that R(c) = A(B(c)) for all coordinates c in layoutB
for (int c = 0; c < size(layoutB); ++c) {
EXPECT_EQ(layoutR(c), layoutA(layoutB(c)));
}
}

View File

@ -45,10 +45,10 @@ test_logical_divide(LayoutA const& layoutA,
auto layoutR = logical_divide(layoutA, layoutB);
CUTLASS_TRACE_HOST("test_logical_divide()");
CUTLASS_TRACE_HOST(shape(layoutA) << " / " << shape(layoutB) << " => " << shape(layoutR) );
CUTLASS_TRACE_HOST( shape(layoutA) << " / " << shape(layoutB) << " => " << shape(layoutR));
CUTLASS_TRACE_HOST(stride(layoutA) << " " << stride(layoutB) << " => " << stride(layoutR));
// Test that layout R is compatible with layout B
// Test that layout B is compatible with layout R_0
ASSERT_EQ(rank(layoutR), 2);
ASSERT_TRUE(compatible(layoutB, layout<0>(layoutR)));
}
@ -186,10 +186,10 @@ TEST(CuTe_core, Logical_divide)
// Enforcement for dynamic cases
auto result = logical_divide(layout, tile);
static_assert(decltype(shape<0>(result) == Int<32>{})::value);
static_assert(decltype(stride<0>(result) == Int<1>{})::value);
assert(shape<1>(result) == 1);
static_assert(decltype(stride<1>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(shape<0>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(stride<0>(result) == Int<1>{})::value);
ASSERT_TRUE(shape<1>(result) == 1);
ASSERT_TRUE(decltype(stride<1>(result) == Int<32>{})::value);
}
{
@ -200,10 +200,10 @@ TEST(CuTe_core, Logical_divide)
// Enforcement for dynamic cases
auto result = logical_divide(layout, tile);
static_assert(decltype(shape<0>(result) == Int<32>{})::value);
static_assert(decltype(stride<0>(result) == Int<1>{})::value);
assert(shape<1>(result) == 2);
static_assert(decltype(stride<1>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(shape<0>(result) == Int<32>{})::value);
ASSERT_TRUE(decltype(stride<0>(result) == Int<1>{})::value);
ASSERT_TRUE(shape<1>(result) == 2);
ASSERT_TRUE(decltype(stride<1>(result) == Int<32>{})::value);
}
{
@ -221,10 +221,10 @@ TEST(CuTe_core, Logical_divide)
// Enforcement for dynamic cases
auto result = logical_divide(layout, tile);
static_assert(decltype(shape<0>(result) == Int<48>{})::value);
static_assert(decltype(stride<0>(result) == Int<1>{})::value);
assert(shape<1>(result) == 1);
static_assert(decltype(stride<1>(result) == Int<48>{})::value);
ASSERT_TRUE(decltype(shape<0>(result) == Int<48>{})::value);
ASSERT_TRUE(decltype(stride<0>(result) == Int<1>{})::value);
ASSERT_TRUE(shape<1>(result) == 1);
ASSERT_TRUE(decltype(stride<1>(result) == Int<48>{})::value);
}
// DISALLOWED

View File

@ -46,13 +46,9 @@ test_logical_product(LayoutA const& layoutA,
CUTLASS_TRACE_HOST(shape(layoutA) << " x " << shape(layoutB) << " => " << shape(layoutR) );
CUTLASS_TRACE_HOST(stride(layoutA) << " " << stride(layoutB) << " => " << stride(layoutR));
// Test that layout R is compatible with layout B
ASSERT_EQ(rank(layoutR), 2);
//assert(compatible(layoutB, layout<0>(layoutR)));
//assert(consistent(layoutA, layout<1>(layoutR)));
// True post-condition:
ASSERT_TRUE(layoutA == layout<0>(layoutR));
ASSERT_TRUE(compatible(layoutB, layout<1>(layoutR)));
}
TEST(CuTe_core, Logical_product)

File diff suppressed because it is too large Load Diff

View File

@ -58,7 +58,7 @@ template <
class HostEVTNodeBase {
public:
using Gemm = Gemm_;
using TestBedImpl = typename detail::TestbedImpl<Gemm>;
using TestBedImpl = typename detail::TestbedImpl<Gemm, cutlass::epilogue::thread::Identity, true>;
using Kernel = typename Gemm::GemmKernel;
using Epilogue = typename Kernel::CollectiveEpilogue;
using ElementCompute = typename TestBedImpl::ElementCompute;
@ -238,9 +238,9 @@ public:
_bias.resize(cutlass::Coord<1>(_N));
EXPECT_TRUE(
impl_.initialize_tensor(
detail::initialize_tensor(
_bias.host_view(), cutlass::Distribution::Uniform,
impl_.seed + 2023
impl_.collective_mma_inputs.seed + 2023
)
);
_bias.sync_device();
@ -306,9 +306,9 @@ public:
_bias.resize(cutlass::Coord<1>(_M));
EXPECT_TRUE(
impl_.initialize_tensor(
detail::initialize_tensor(
_bias.host_view(), cutlass::Distribution::Uniform,
impl_.seed + 2023
impl_.collective_mma_inputs.seed + 2023
)
);
_bias.sync_device();
@ -393,10 +393,10 @@ public:
)
);
EXPECT_TRUE(
impl_.initialize_tensor(
detail::initialize_tensor(
_tensor_aux_load.host_view(),
cutlass::Distribution::Uniform,
impl_.seed + 2023
impl_.collective_mma_inputs.seed + 2023
)
);
_tensor_aux_load.sync_device();
@ -1154,7 +1154,7 @@ public:
// The EVT Module to test
using EVTModule = typename EVT::EVTModule;
using TestBedImpl = typename detail::TestbedImpl<Gemm>;
using TestBedImpl = typename detail::TestbedImpl<Gemm, cutlass::epilogue::thread::Identity, true>;
using Kernel = typename Gemm::GemmKernel;
using Epilogue = typename Gemm::GemmKernel::CollectiveEpilogue;
using ElementAccumulator = typename Kernel::ElementAccumulator;
@ -1178,7 +1178,9 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(init_A_, init_B_, init_C_, seed_), check_relative_equality(check_relative_equality_) { }
impl_((check_relative_equality_ ? CheckEquality::RELATIVE : CheckEquality::EXACT), ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_),
check_relative_equality(check_relative_equality_) { }
Testbed3xEVT(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
@ -1186,7 +1188,9 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(init_A_, init_B_, init_C_, seed_), check_relative_equality(false) { }
impl_(CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_),
check_relative_equality(false) { }
Testbed3xEVT(
typename LayoutTagA::Stride stride_factor_A_,
@ -1198,15 +1202,10 @@ public:
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(stride_factor_A_,
stride_factor_B_,
stride_factor_C_,
stride_factor_D_,
init_A_,
init_B_,
init_C_,
seed_),
check_relative_equality(false) { }
impl_(stride_factor_A_, stride_factor_B_, stride_factor_C_, stride_factor_D_,
CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_),
check_relative_equality(false) { }
/// Initializes data structures
void initialize(ProblemShapeType problem_size) {
@ -1229,11 +1228,11 @@ public:
auto K = cute::get<2>(problem_shape_MNKL);
auto L = cute::get<3>(problem_shape_MNKL);
auto A = cute::make_tensor(impl_.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
auto B = cute::make_tensor(impl_.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.stride_b));
auto LayoutD = cute::make_layout(cute::make_shape(M, N, L), impl_.stride_d);
auto A = cute::make_tensor(impl_.collective_mma_inputs.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.collective_mma_inputs.stride_a));
auto B = cute::make_tensor(impl_.collective_mma_inputs.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.collective_mma_inputs.stride_b));
auto LayoutD = cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_d);
cutlass::reference::host::GettMainloopParams<ElementAccumulator, decltype(A), decltype(B)> mainloop_params{A, B};
@ -1277,9 +1276,9 @@ public:
<< ", Batch count = " << L << "\n\n";
file
<< "A =\n" << impl_.tensor_A.host_view()
<< "\nB =\n" << impl_.tensor_B.host_view()
<< "\nC =\n" << impl_.tensor_C.host_view() << "\n\n";
<< "A =\n" << impl_.collective_mma_inputs.tensor_A.host_view()
<< "\nB =\n" << impl_.collective_mma_inputs.tensor_B.host_view()
<< "\nC =\n" << impl_.collective_epilogue.tensor_C.host_view() << "\n\n";
file << error_ss.str();
}
@ -1329,15 +1328,15 @@ public:
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
{
impl_.tensor_A.device_data(), impl_.stride_a,
impl_.tensor_B.device_data(), impl_.stride_b
impl_.collective_mma_inputs.tensor_A.device_data(), impl_.collective_mma_inputs.stride_a,
impl_.collective_mma_inputs.tensor_B.device_data(), impl_.collective_mma_inputs.stride_b
},
{ // Epilogue arguments
{}, // thread
static_cast<ElementC*>(host_reference.get_tensor_C_ptr()),
impl_.stride_c,
impl_.collective_epilogue.stride_c,
static_cast<ElementD*>(host_reference.get_tensor_D_ptr()),
impl_.stride_d
impl_.collective_epilogue.stride_d
}, // Epilogue arguments end
hw_info,
scheduler_args

View File

@ -101,7 +101,8 @@ struct Testbed3xTensorBroadcast {
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = TestBedImpl::kDefaultSeed
) :
impl_(init_A_, init_B_, init_C_, seed_) { }
impl_(CheckEquality::EXACT, ScalarLoc::ON_DEVICE, VectorBeta::ENABLED,
init_A_, init_B_, init_C_, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform, seed_) { }
Testbed3xTensorBroadcast(
typename LayoutTagA::Stride stride_factor_A_,
@ -117,9 +118,12 @@ struct Testbed3xTensorBroadcast {
stride_factor_B_,
stride_factor_C_,
stride_factor_D_,
CheckEquality::EXACT, ScalarLoc::ON_HOST, VectorBeta::ENABLED,
init_A_,
init_B_,
init_C_,
cutlass::Distribution::Uniform,
cutlass::Distribution::Uniform,
seed_) { }
/// Initializes data structures
@ -135,7 +139,7 @@ struct Testbed3xTensorBroadcast {
auto bias_size = PerColBias ? cute::get<1>(problem_shape_MNKL) : cute::get<0>(problem_shape_MNKL);
bias.resize(cutlass::Coord<1>(bias_size));
EXPECT_TRUE(impl_.initialize_tensor(bias.host_view(), cutlass::Distribution::Uniform, impl_.seed + 2023));
EXPECT_TRUE(detail::initialize_tensor(bias.host_view(), cutlass::Distribution::Uniform, impl_.collective_mma_inputs.seed + 2023));
bias.sync_device();
}
@ -147,8 +151,8 @@ struct Testbed3xTensorBroadcast {
auto c_coord = cutlass::make_Coord(M * L, N);
tensor_C1.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.stride_factor_C));
EXPECT_TRUE(impl_.initialize_tensor(tensor_C1.host_view(), cutlass::Distribution::Uniform, impl_.seed + 2024));
tensor_C1.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.collective_epilogue.stride_factor_C));
EXPECT_TRUE(detail::initialize_tensor(tensor_C1.host_view(), cutlass::Distribution::Uniform, impl_.collective_mma_inputs.seed + 2024));
tensor_C1.sync_device();
}
@ -161,19 +165,19 @@ struct Testbed3xTensorBroadcast {
{
auto [M, N, K, L] = problem_shape_MNKL;
impl_.tensor_D.sync_host();
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_A.host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_B.host_view()), 0);
impl_.collective_epilogue.tensor_D.sync_host();
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_mma_inputs.tensor_A.host_view()), 0);
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_mma_inputs.tensor_B.host_view()), 0);
if (impl_.tensor_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_D.host_view()), 0);
if (impl_.collective_epilogue.tensor_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_epilogue.tensor_D.host_view()), 0);
}
if (impl_.reference_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.reference_D.host_view()), 0);
if (impl_.collective_epilogue.reference_D.size() > 1) {
EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.collective_epilogue.reference_D.host_view()), 0);
}
bool passed = cutlass::reference::host::TensorEquals(impl_.reference_D.host_view(), impl_.tensor_D.host_view());
bool passed = cutlass::reference::host::TensorEquals(impl_.collective_epilogue.reference_D.host_view(), impl_.collective_epilogue.tensor_D.host_view());
EXPECT_TRUE(passed);
@ -196,12 +200,12 @@ struct Testbed3xTensorBroadcast {
}
file
<< "A =\n" << impl_.tensor_A.host_view()
<< "\nB =\n" << impl_.tensor_B.host_view()
<< "\nC0 =\n" << impl_.tensor_C.host_view()
<< "A =\n" << impl_.collective_mma_inputs.tensor_A.host_view()
<< "\nB =\n" << impl_.collective_mma_inputs.tensor_B.host_view()
<< "\nC0 =\n" << impl_.collective_epilogue.tensor_C.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\n\nReference =\n" << impl_.reference_D.host_view()
<< "\n\nComputed =\n" <<impl_.tensor_D.host_view();
<< "\n\nReference =\n" << impl_.collective_epilogue.reference_D.host_view()
<< "\n\nComputed =\n" <<impl_.collective_epilogue.tensor_D.host_view();
}
return passed;
@ -221,40 +225,39 @@ struct Testbed3xTensorBroadcast {
auto K = cute::get<2>(problem_shape_MNKL);
auto L = cute::get<3>(problem_shape_MNKL);
auto A = cute::make_tensor(impl_.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
auto B = cute::make_tensor(impl_.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.stride_b));
auto D = cute::make_tensor(impl_.reference_D.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_d));
auto A = cute::make_tensor(impl_.collective_mma_inputs.tensor_A.host_data(),
cute::make_layout(cute::make_shape(M, K, L), impl_.collective_mma_inputs.stride_a));
auto B = cute::make_tensor(impl_.collective_mma_inputs.tensor_B.host_data(),
cute::make_layout(cute::make_shape(N, K, L), impl_.collective_mma_inputs.stride_b));
auto D = cute::make_tensor(impl_.collective_epilogue.reference_D.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_d));
auto Bias = cute::make_tensor(static_cast<ElementBias*>(use_bias ? bias.host_data() : nullptr),
cute::make_layout(PerColBias ? cute::make_shape(1, N) : cute::make_shape(M, 1)));
auto C0 = cute::make_tensor(impl_.tensor_C.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
auto C0 = cute::make_tensor(impl_.collective_epilogue.tensor_C.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
auto C1 = cute::make_tensor(tensor_C1.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
// Create host workspace for output of testbed. This computes a portion of the epilogue:
// ref_compute_out = Activation(alpha * (A @ B) + bias)
cutlass::HostTensor<ElementCompute, LayoutTagC> ref_compute_out;
auto c_coord = cutlass::make_Coord(M * L, N);
ref_compute_out.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.stride_factor_C), false);
ref_compute_out.resize(c_coord, cutlass::layout::Affine2Layout_Factory<LayoutTagD>::layout_factory(c_coord, impl_.collective_epilogue.stride_factor_C), false);
auto RefComputeOut = cute::make_tensor(ref_compute_out.host_data(),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
cutlass::reference::host::GettMainloopParams<ElementAccumulator, decltype(A), decltype(B)> mainloop_params{A, B};
// Use a dummy null tensor for operand C because the epilogue overrides C.
auto dummy_C = cute::make_tensor(static_cast<ElementC*>(nullptr),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_c));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_c));
ElementCompute dummy_beta(0);
auto dummy_Aux = cute::make_tensor(static_cast<ElementD*>(nullptr),
cute::make_layout(cute::make_shape(M, N, L), impl_.stride_d));
cute::make_layout(cute::make_shape(M, N, L), impl_.collective_epilogue.stride_d));
auto dummy_Valpha = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
cute::make_layout(cute::make_shape(M, 1)));
auto dummy_Vbeta = cute::make_tensor(static_cast<ElementCompute*>(nullptr),
cute::make_layout(cute::make_shape(M, 1)));
cutlass::reference::host::GettEpilogueParams<
ElementScalar,
ElementScalar,
@ -361,17 +364,17 @@ struct Testbed3xTensorBroadcast {
arguments = typename Gemm::Arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
{ impl_.tensor_A.device_data(), impl_.stride_a,
impl_.tensor_B.device_data(), impl_.stride_b,
{ impl_.collective_mma_inputs.tensor_A.device_data(), impl_.collective_mma_inputs.stride_a,
impl_.collective_mma_inputs.tensor_B.device_data(), impl_.collective_mma_inputs.stride_b,
impl_.mma_promotion_interval
},
{ // Epilogue arguments
{ alpha, beta }, // ThreadOp arguments
impl_.stride_c,
impl_.tensor_D.device_data(),
impl_.stride_d,
impl_.collective_epilogue.stride_c,
impl_.collective_epilogue.tensor_D.device_data(),
impl_.collective_epilogue.stride_d,
use_bias ? bias.device_data() : nullptr,
impl_.tensor_C.device_data(),
impl_.collective_epilogue.tensor_C.device_data(),
tensor_C1.device_data()
}, // Epilogue arguments end
hw_info

View File

@ -112,6 +112,8 @@ TEST(SM80_Device_Gemm_tf32t_tf32n_f32n_tensor_op_f32, 128x128x32_64x64x64) {
EXPECT_TRUE(test::gemm::device::TestAll<Gemm>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Gemm_tf32t_tf32t_f32n_tensor_op_f32, 128x128x32_64x64x64) {
using Config = cutlass::gemm::device::DefaultGemmConfigurationToCutlass3Types<
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
@ -132,4 +134,24 @@ TEST(SM80_Device_Gemm_tf32t_tf32t_f32n_tensor_op_f32, 128x128x32_64x64x64) {
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Gemm_tf32t_tf32n_f32n_tensor_op_f32, 128x128x32_64x64x64_profiling) {
using Config = cutlass::gemm::device::DefaultGemmConfigurationToCutlass3Types<
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
cutlass::tfloat32_t, cutlass::layout::RowMajor,
cutlass::tfloat32_t, cutlass::layout::ColumnMajor,
float, cutlass::layout::RowMajor,
float>;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int,int,int,int>,
Config::CollectiveMainloop,
Config::CollectiveEpilogue
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
EXPECT_TRUE(test::gemm::device::TestGemmPerf3x<Gemm>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
//#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -97,9 +97,7 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
test::gemm::device::Testbed3x<Gemm, cutlass::epilogue::thread::ReLu> testbed;
bool passed = test::gemm::device::TestAll<Gemm>(1, 1, testbed);
bool passed = test::gemm::device::TestAll<Gemm, cutlass::epilogue::thread::ReLu>(1, 1);
EXPECT_TRUE(passed);
}
@ -156,6 +154,7 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
#pragma GCC diagnostic pop // Re-enable deprecation warnings
}
TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 256x128x64_2x2x1_BiasF32_ReLU) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
@ -239,9 +238,8 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool check_relative_equality = true;
bool passed = test::gemm::device::TestAllBiasElementwise<Gemm>(1, 1, check_relative_equality);
using namespace test::gemm::device;
bool passed = TestAllBiasElementwise<Gemm>(1, 1, CheckEquality::RELATIVE);
EXPECT_TRUE(passed);
}
@ -600,8 +598,8 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_cooperative_epilogue, 25
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool passed = test::gemm::device::TestAllBiasElementwise<Gemm>(1.0, 0.0, /*check_relative_equality=*/true);
using namespace test::gemm::device;
bool passed = TestAllBiasElementwise<Gemm>(1.0, 0.0, CheckEquality::RELATIVE);
EXPECT_TRUE(passed);
}

View File

@ -97,8 +97,7 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_persistent_epilogue, 128
>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
test::gemm::device::Testbed3x<Gemm, cutlass::epilogue::thread::ReLu> testbed;
bool passed = test::gemm::device::TestAll<Gemm>(1, 1, testbed);
bool passed = test::gemm::device::TestAll<Gemm, cutlass::epilogue::thread::ReLu>(1, 1);
EXPECT_TRUE(passed);
}
@ -186,8 +185,8 @@ TEST(SM90_Device_Gemm_f16t_f16n_f32t_tensor_op_gmma_f32_persistent_epilogue, 128
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
bool check_relative_equality = true;
bool passed = test::gemm::device::TestAllBiasElementwise<Gemm>(1, 1, check_relative_equality);
using namespace test::gemm::device;
bool passed = TestAllBiasElementwise<Gemm>(1, 1, CheckEquality::RELATIVE);
EXPECT_TRUE(passed);
}

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@@ -1,24 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

View File

@@ -50,6 +50,7 @@
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
#if (!((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 8)))
TEST(SM80_Device_Syr2k_cf32n_cf32t_l_tensor_op_f32, 64x64x16_32x32x16) {
@@ -145,6 +146,7 @@ TEST(SM80_Device_Syr2k_cf32n_cf32t_u_tensor_op_f32, 64x64x16_32x32x16) {
EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@@ -50,6 +50,7 @@
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
#if (!((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 8)))
TEST(SM80_Device_Syr2k_cf32n_cf32t_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
@@ -145,6 +146,7 @@ TEST(SM80_Device_Syr2k_cf32n_cf32t_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@@ -50,6 +50,7 @@
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
#if (!((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 8)))
TEST(SM80_Device_Syr2k_cf64n_cf64n_l_tensor_op_f64, 32x32x16_16x16x16) {
@@ -145,6 +146,7 @@ TEST(SM80_Device_Syr2k_cf64n_cf64n_u_tensor_op_f64, 32x32x16_16x16x16) {
EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@@ -0,0 +1,30 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/