Collection of changes to fix clang build. (#1200)

* Remove unused variables
* Qualify calls to make_fragment_? from templated base class. Fixes clang build error.
* Add missing `#include <cstdio>`
* Various changes to fix clang compile errors.
* More changes to fix clang build. Remaining issues:
  - `params` initializer of `CollectiveEpilogue`.
  - `ops` initializer of `Sm90VisitorImplBase`.
  - `__usAtomicCAS` needs to be added to clang upstream.
* Fix remaining clang build issues.
* Qualify `cute::rank()` calls.
* Qualify some more calls that are otherwise ambiguous between the `cute` and `std` namespaces.
* Double-escape special registers in inline asm.
* small change

---------

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2023-12-08 20:42:12 +01:00
parent f4a0216601
commit e1483d5fa0
46 changed files with 308 additions and 273 deletions
--- a/test/unit/gemm/device/gemm_testbed_3x.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x.hpp
@ -193,8 +193,8 @@ struct TestbedImpl {

  using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90::RasterOrderOptions;

-  static_assert(rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");

  static constexpr uint32_t mma_promotion_interval = 4;

@ -523,9 +523,6 @@ struct TestbedImpl {
    Gemm& gemm_op,
    typename Gemm::Arguments& arguments,
    cutlass::device_memory::allocation<uint8_t>& workspace) {
-    int M = cute::size<0>(problem_size);
-    int N = cute::size<1>(problem_size);
-    int K = cute::size<2>(problem_size);
    int L = 1;
    if constexpr(cute::rank(ProblemShapeType{}) == 4) {
      L = cute::size<3>(problem_size);
@ -581,7 +578,7 @@ struct TestbedImpl {
    cutlass::KernelHardwareInfo hw_info;
    hw_info.device_id = 0;
    if (not profiling) {
-      this->sm_count = min(MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+      this->sm_count = std::min(MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
      hw_info.sm_count = this->sm_count;
    }
    else {
@ -1240,7 +1237,7 @@ struct Testbed3xFusionOperation {
    
    hw_info.device_id = 0;
    if (not profiling) {
-      impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+      impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
      hw_info.sm_count = impl_.sm_count;
    }
    else {
--- a/test/unit/gemm/device/gemm_testbed_3x_evt.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x_evt.hpp
@ -173,7 +173,7 @@ public:
  HostScalarBroadcast(){}
  template<typename ProblemShapeType, typename TestBedImpl>
  HostScalarBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-    :_scalar(ElementCompute(Value)), Base(check_relative_equality) {}
+    : Base(check_relative_equality), _scalar(ElementCompute(Value)) {}
  
  template <class ElementAccumulator>
  ElementCompute visit(
@ -232,7 +232,7 @@ public:
  HostRowBroadcast(){}
  template<typename ProblemShapeType>
  HostRowBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-    :impl_(impl), Base(check_relative_equality) {
+    : Base(check_relative_equality), impl_(impl) {
    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
    _N = cute::get<1>(problem_shape_MNKL);
    _bias.resize(cutlass::Coord<1>(_N));
@ -300,7 +300,7 @@ public:
  HostColBroadcast(){}
  template<typename ProblemShapeType>
  HostColBroadcast(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-    :impl_(impl), Base(check_relative_equality) {
+    : Base(check_relative_equality), impl_(impl) {
    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
    _M = cute::get<0>(problem_shape_MNKL);
    _bias.resize(cutlass::Coord<1>(_M));
@ -382,7 +382,7 @@ public:
  HostAuxLoad(){}
  template<typename ProblemShapeType>
  HostAuxLoad(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false)
-    :impl_(impl), Base(check_relative_equality){
+    : Base(check_relative_equality), impl_(impl){
    auto problem_shape_NMKL = cute::append<4>(problem_size, 1);
    auto [_M, _N, K, _L] = problem_shape_NMKL;
    auto aux_coord = cutlass::make_Coord(_M * _L, _N);
@ -513,8 +513,8 @@ public:
  HostUnaryCompute(){}
  template <typename ProblemShapeType, typename TestBedImpl>
  HostUnaryCompute(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-    _child_0(problem_size, impl, check_relative_equality),
-    Base(check_relative_equality) { }
+    Base(check_relative_equality),
+    _child_0(problem_size, impl, check_relative_equality) { }

  template <class ElementAccumulator>
  ElementCompute visit(
@ -578,8 +578,8 @@ public:
  HostAuxStore(){}
  template <typename ProblemShapeType>
  HostAuxStore(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-    impl_(impl),
-    Base(check_relative_equality) {
+    Base(check_relative_equality),
+    impl_(impl) {
    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
    auto [_M, _N, K, _L] = problem_shape_MNKL;
    auto aux_coord = cutlass::make_Coord(_M * _L, _N);
@ -677,8 +677,8 @@ public:
  HostRowReduce(){}
  template <typename ProblemShapeType>
  HostRowReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-    impl_(impl),
-    Base(check_relative_equality) {
+    Base(check_relative_equality),
+    impl_(impl) {
    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
    _N = cute::get<1>(problem_shape_MNKL);
    _tensor_row_reduce.resize(cutlass::Coord<1>(_N));
@ -764,8 +764,8 @@ public:
  HostColumnReduce(){}
  template <typename ProblemShapeType>
  HostColumnReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-    impl_(impl),
-    Base(check_relative_equality) {
+    Base(check_relative_equality),
+    impl_(impl) {
    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
    _M = cute::get<0>(problem_shape_MNKL);
    _tensor_column_reduce.resize(cutlass::Coord<1>(_M));
@ -850,9 +850,8 @@ public:
  HostScalarReduce(){}
  template <typename ProblemShapeType>
  HostScalarReduce(ProblemShapeType problem_size, TestBedImpl impl, bool check_relative_equality=false):
-    impl_(impl),
-    Base(check_relative_equality) {
-    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+    Base(check_relative_equality),
+    impl_(impl) {
    _tensor_scalar_reduce.resize(cutlass::Coord<1>(1));
    _reference_scalar_reduce.resize(cutlass::Coord<1>(1));
    _reduce_buffer.resize(cutlass::Coord<1>(1));
@ -1229,7 +1228,6 @@ public:
    auto N = cute::get<1>(problem_shape_MNKL);
    auto K = cute::get<2>(problem_shape_MNKL);
    auto L = cute::get<3>(problem_shape_MNKL);
-    auto coord_0 = cutlass::make_Coord(0);

    auto A = cute::make_tensor(impl_.tensor_A.host_data(),
      cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
@ -1307,7 +1305,7 @@ public:
    cutlass::KernelHardwareInfo hw_info;
    hw_info.device_id = 0;
    if (not profiling) {
-      impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+      impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
      hw_info.sm_count = impl_.sm_count;
    }
    else {
--- a/test/unit/gemm/device/gemm_testbed_3x_tensor_broadcast.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x_tensor_broadcast.hpp
@ -158,7 +158,6 @@ struct Testbed3xTensorBroadcast {
      bool use_bias)
  {
    auto [M, N, K, L] = problem_shape_MNKL;
-    auto coord_0 = cutlass::make_Coord(0);

    impl_.tensor_D.sync_host();
    EXPECT_GT(cutlass::reference::host::TensorNorm(impl_.tensor_A.host_view()), 0);
@ -218,7 +217,6 @@ struct Testbed3xTensorBroadcast {
    auto N = cute::get<1>(problem_shape_MNKL);
    auto K = cute::get<2>(problem_shape_MNKL);
    auto L = cute::get<3>(problem_shape_MNKL);
-    auto coord_0 = cutlass::make_Coord(0);

    auto A = cute::make_tensor(impl_.tensor_A.host_data(),
        cute::make_layout(cute::make_shape(M, K, L), impl_.stride_a));
@ -338,7 +336,7 @@ struct Testbed3xTensorBroadcast {
    cutlass::KernelHardwareInfo hw_info;
    hw_info.device_id = 0;
    if (not profiling) {
-      impl_.sm_count = min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
+      impl_.sm_count = std::min(impl_.MaxSmCount, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id));
      hw_info.sm_count = impl_.sm_count;
    }
    else {
--- a/test/unit/gemm/device/sm90_evt_operations.hpp
+++ b/test/unit/gemm/device/sm90_evt_operations.hpp
@ -163,7 +163,7 @@ public:
  using EVTModule = HEVT<
  HostAuxStore<Gemm, true>,
  HEVT<
-    HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,  // activation(Z) * scaled_d
+    HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,  // activation(Z) * scaled_d
    HEVT<
      HostCompute<Gemm, ActivationFn>, // activation(Z)
      HEVT<
@ -174,11 +174,11 @@ public:
          HostCompute<Gemm, cutlass::homogeneous_multiply_add>,
          HostScalarBroadcast<Gemm, 1, 3>, // scale_a * scale_b * alpha
          HostAccumulator<Gemm>,
-          HostColBroadcast<Gemm, ElementD>,
+          HostColBroadcast<Gemm, ElementD>
        >
      >
    >,
-    HostScalarBroadcast<Gemm, 1>, // scale_d
+    HostScalarBroadcast<Gemm, 1> // scale_d
  >
  >;
 };
@ -211,26 +211,26 @@ public:
          HostCompute<Gemm, cutlass::homogeneous_multiply_add>,
          HostScalarBroadcast<Gemm, 1, 3>, // scale_a * scale_b * alpha
          HostAccumulator<Gemm>,
-          HostColBroadcast<Gemm, ElementD>,
+          HostColBroadcast<Gemm, ElementD>
        >
      >,
      // D = activation(Z) * scaled_d, amax_d = max(abs(elements in D))
      HEVT<
-        HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,
+        HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,
        HEVT<
          HostScalarReduce<Gemm, amax, float>,
          HEVT<
            HostCompute<Gemm, ActivationFn>, //activation(Z) * scaled_d
-            HostAccumulator<Gemm>, // Z
+            HostAccumulator<Gemm> // Z
          >
        >,
-        HostScalarBroadcast<Gemm, 1>, // scale_d
+        HostScalarBroadcast<Gemm, 1> // scale_d
      >,
      // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux))
      HEVT<
        HostAuxStore<Gemm, false, ElementD, cutlass::layout::RowMajor>,
        HEVT<
-          HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::Op>,
+          HostCompute<Gemm, cutlass::epilogue::fusion::detail::ScaleOutOp<ElementD>::template Op>,
          HEVT<
            HostScalarReduce<Gemm, amax, float>,
            HostAccumulator<Gemm>