CUTLASS 3.6.0 (#1850)

* v3.6 * update changelog * update readme * fix typo * fixing typos * hopper gemm with weight prefetch --------- Co-authored-by: yuzhai <yuzhai@nvidia.com> Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2024-10-09 12:33:27 -07:00
parent 0837a2a00a
commit cc3c29a81a
354 changed files with 105943 additions and 8203 deletions
--- a/include/cute/tensor_impl.hpp
+++ b/include/cute/tensor_impl.hpp
@ -41,18 +41,16 @@

 #pragma once

-#include <cute/config.hpp>
-
-#include <cute/util/type_traits.hpp>
-#include <cute/numeric/integral_constant.hpp>
-#include <cute/numeric/integer_sequence.hpp>
-
-#include <cute/container/tuple.hpp>
-#include <cute/container/array_aligned.hpp>
-#include <cute/container/array_subbyte.hpp>
-
-#include <cute/pointer.hpp>
-#include <cute/layout.hpp>
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/layout.hpp>                     // cute::Shape
+#include <cute/layout_composed.hpp>            // cute::is_composed_layout
+#include <cute/pointer.hpp>                    // cute::recast_ptr
+#include <cute/pointer_base.hpp>               // cute::iterator_traits
+#include <cute/container/array_aligned.hpp>    // cute::array_aligned
+#include <cute/container/array_subbyte.hpp>    // cute::array_subbyte
+#include <cute/container/tuple.hpp>            // cute::tuple
+#include <cute/numeric/integral_constant.hpp>  // cute::is_integral
+#include <cute/util/type_traits.hpp>           // __CUTE_REQUIRES

 namespace cute
 {
@ -69,7 +67,7 @@ namespace cute
 //   iterator begin();
 // };

-template <class T, int N>
+template <class T, size_t N>
 struct ArrayEngine
 {
  using Storage = typename conditional<(sizeof_bits<T>::value % 8 == 0),
@ -85,6 +83,24 @@ struct ArrayEngine
  CUTE_HOST_DEVICE constexpr auto begin()       { return storage_.begin(); }
 };

+// Specialization for sparse_elem<S,T> tensor allocation/iteration.
+// The backing storage holds N/S elements of T, so N is required to be a
+// multiple of the sparsity S (enforced by the static_assert below).
+template <int S, class T, size_t N>
+struct ArrayEngine<sparse_elem<S,T>, N>
+{
+  static_assert(N % S == 0, "Expected a multiple of the sparsity.");
+  using value_type   = sparse_elem<S,T>;
+  // array_aligned when T's bit width is a multiple of 8, array_subbyte otherwise;
+  // in both cases sized to N/S elements of T.
+  using Storage      = typename conditional<(sizeof_bits<T>::value % 8 == 0),
+                                            array_aligned<T,N/S>,
+                                            array_subbyte<T,N/S>>::type;
+  using iterator     = sparse_ptr<S,sparse_elem<S,T>*>;
+  // reference and element_type are taken from iterator_traits of the sparse iterator.
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  Storage storage_;
+
+  // Both overloads recast the storage's begin() to a pointer over value_type.
+  // NOTE(review): the result is presumably compatible with `iterator` above -- confirm against recast_ptr.
+  CUTE_HOST_DEVICE constexpr auto begin() const { return recast_ptr<value_type>(storage_.begin()); }
+  CUTE_HOST_DEVICE constexpr auto begin()       { return recast_ptr<value_type>(storage_.begin()); }
+};
+
 template <class Iterator>
 struct ViewEngine
 {
@ -622,6 +638,30 @@ filter_zeros(Tensor<Engine,Layout>&& tensor) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
 }

+// filter_zeros with a profile, for a const tensor reference: returns
+// make_tensor over the same data() with layout filter_zeros(tensor.layout(), profile).
+// NOTE(review): how `profile` affects the filtering is defined by the layout overload -- confirm there.
+template <class Engine, class Layout, class Profile>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout> const& tensor, Profile const& profile)
+{
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
+}
+
+// filter_zeros with a profile, for a non-const lvalue tensor: returns
+// make_tensor over tensor.data() with layout filter_zeros(tensor.layout(), profile).
+// NOTE(review): the meaning of `profile` comes from the layout overload, not visible here -- confirm.
+template <class Engine, class Layout, class Profile>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout>& tensor, Profile const& profile)
+{
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
+}
+
+// filter_zeros with a profile, for an rvalue tensor: returns
+// make_tensor over tensor.data() with layout filter_zeros(tensor.layout(), profile).
+// The tensor is only read through data() and layout(); it is not moved from in this body.
+template <class Engine, class Layout, class Profile>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout>&& tensor, Profile const& profile)
+{
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
+}
+
 // Remove all of the 0-strides and 1-sizes
 template <class Engine, class Layout>
 CUTE_HOST_DEVICE constexpr
@ -755,10 +795,10 @@ auto
 max_common_vector(Tensor<SrcEngine,SrcLayout> const& a,
                  Tensor<DstEngine,DstLayout> const& b)
 {
-  using SrcType = typename Tensor<SrcEngine,SrcLayout>::value_type;
-  using DstType = typename Tensor<DstEngine,DstLayout>::value_type;
-  using SrcRef  = typename Tensor<SrcEngine,SrcLayout>::reference;
-  using DstRef  = typename Tensor<SrcEngine,SrcLayout>::reference;
+  using SrcType = typename SrcEngine::value_type;
+  using SrcRef  = typename SrcEngine::reference;
+  using DstType = typename DstEngine::value_type;
+  using DstRef  = typename DstEngine::reference;

  // Determine if vectorization candidates at all
  if constexpr (// Should be the same value_types, else the copy is also performing a cast
@ -795,10 +835,10 @@ auto
 max_common_layout(Tensor<SrcEngine,SrcLayout> const& a,
                  Tensor<DstEngine,DstLayout> const& b)
 {
-  using SrcType = typename Tensor<SrcEngine,SrcLayout>::value_type;
-  using DstType = typename Tensor<DstEngine,DstLayout>::value_type;
-  using SrcRef  = typename Tensor<SrcEngine,SrcLayout>::reference;
-  using DstRef  = typename Tensor<SrcEngine,SrcLayout>::reference;
+  using SrcType = typename SrcEngine::value_type;
+  using SrcRef  = typename SrcEngine::reference;
+  using DstType = typename DstEngine::value_type;
+  using DstRef  = typename DstEngine::reference;

  // Determine if vectorization candidates at all
  if constexpr (// Should be the same value_types, else the copy is also performing a cast