3.6.0 update (#2005)

* 3.6.0 update * doc and swap stuff --------- Co-authored-by: yuzhai <yuzhai@nvidia.com> Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2024-12-24 22:34:40 -08:00
parent e1cd8c7866
commit 3d261a5974
258 changed files with 10863 additions and 3883 deletions
--- a/include/cute/tensor_impl.hpp
+++ b/include/cute/tensor_impl.hpp
@ -84,6 +84,8 @@ struct ArrayEngine
 };

 // Specialization for sparse_elem<S,T> tensor allocation/iteration
+// NOTE: This can and should be used for allocation of SMEM as well!
+//       Fuse these two ArrayEngines?
 template <int S, class T, size_t N>
 struct ArrayEngine<sparse_elem<S,T>, N>
 {
@ -858,6 +860,17 @@ max_common_layout(Tensor<SrcEngine,SrcLayout> const& a,
  CUTE_GCC_UNREACHABLE;
 }

+/* Return the maximum (statically known) alignment of a Tensor, in bits: the gcd of
+ * max_alignment(t.data()) and max_alignment(t.layout()) scaled by sizeof_bits of the element type. */
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+max_alignment(Tensor<Engine,Layout> const& t)
+{
+  return gcd(max_alignment(t.data()),
+             max_alignment(t.layout()) * static_value<sizeof_bits<typename Engine::value_type>>());
+}
+
 //
 // Key algebraic operations -- Composition, Divide, and Product
 //