Updates and Bug fixes to CUTLASS 3.3 (#1232)

2023-12-05 06:50:49 -08:00
parent 4a1709e17e
commit e9e30c2304
31 changed files with 534 additions and 227 deletions
--- a/include/cute/atom/copy_atom.hpp
+++ b/include/cute/atom/copy_atom.hpp
@@ -353,7 +353,7 @@ struct ThrCopy
  template <class STensor>
  CUTE_HOST_DEVICE
  auto
-  partition_S(STensor&& stensor) {
+  partition_S(STensor&& stensor) const {
    //static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
    //              "Expected ValType for tiling SrcTensor.");
    auto thr_tensor = make_tensor(std::forward<STensor>(stensor).data(), TiledCopy::tidfrg_S(stensor.layout()));
@@ -363,7 +363,7 @@ struct ThrCopy
  template <class DTensor>
  CUTE_HOST_DEVICE
  auto
-  partition_D(DTensor&& dtensor) {
+  partition_D(DTensor&& dtensor) const {
    //static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
    //              "Expected ValType for tiling DstTensor.");
    auto thr_tensor = make_tensor(std::forward<DTensor>(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout()));
@@ -479,10 +479,10 @@ make_tiled_copy_C_atom(Copy_Atom<Args...> const& copy_atom,
  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
 }

-/** Produce a TiledCopy from logical thread and values layouts. 
- * The thread and value layouts map coordinates to thr_idx and val_idx. 
+/** Produce a TiledCopy from logical thread and values layouts.
+ * The thread and value layouts map coordinates to thr_idx and val_idx.
 *    The product of these layouts is taken to produce the TV layout and the Tiler.
- * Useful when threads and values need very specific mappings onto coordinates 
+ * Useful when threads and values need very specific mappings onto coordinates
 *    in the target tensors.
 */
 template <class... Args,
@@ -510,16 +510,16 @@ make_tiled_copy(Copy_Atom<Args...> const& copy_atom,
  return make_tiled_copy_impl(copy_atom, layout_tv, product_each(shape(layout_mn)));
 }

-/** Produce a TiledCopy from thread and value offset maps. 
+/** Produce a TiledCopy from thread and value offset maps.
 * The TV Layout maps threads and values to the codomain of the data_layout.
- * It is verified that the intended codomain is valid within data_layout. 
+ * It is verified that the intended codomain is valid within data_layout.
 * Useful when threads and values don't care about owning specific coordinates, but
 *   care more about the vector-width and offsets between them.
 */
 template <class... Args, class AtomTVLayout, class DataLayout>
 CUTE_HOST_DEVICE constexpr
 auto
-make_cotiled_copy(Copy_Atom<Args...> const& copy_atom, 
+make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
                  AtomTVLayout const& atom_tv_layout,   // atom (thr,val) -> data addr
                  DataLayout   const& data_layout)      // coord          -> data addr    The target layout
 {
--- a/include/cute/atom/copy_traits.hpp
+++ b/include/cute/atom/copy_traits.hpp
@@ -59,7 +59,7 @@ namespace cute
 template <class CopyOperation, class... CopyOpArgs>
 struct Copy_Traits
 {
-  static_assert(sizeof(CopyOperation) == 0, "Copy_Traits not implemented for this Copy_Operation.");
+  static_assert(dependent_false<CopyOperation>, "Copy_Traits not implemented for this CopyOperation.");
 };

 template <class S, class D>
@@ -77,8 +77,8 @@ struct Copy_Traits<UniversalCopy<S,D>>
  using RefLayout = SrcLayout;
 };

-template <>
-struct Copy_Traits<DefaultCopy>
+template <int MaxVecBits>
+struct Copy_Traits<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>>
 {
  // Logical thread id to thread idx (one-thread)
  using ThrID = Layout<_1>;
@@ -108,23 +108,24 @@ copy_explode(PtrS&& s, int_sequence<Is...>,
 } // end namespace detail

 //
-// Generic copy_unpack for any Copy_Traits
+// Generic copy_unpack for common argument-based Copy_Traits
 //
-template <class Operation, class... Args,
-          class TS, class SLayout,
-          class TD, class DLayout>
+
+template <class CopyOp, class... Args,
+          class SEngine, class SLayout,
+          class DEngine, class DLayout>
 CUTE_HOST_DEVICE constexpr
 void
-copy_unpack(Copy_Traits<Operation, Args...> const&,
-            Tensor<TS,SLayout> const& src,
-            Tensor<TD,DLayout>      & dst)
+copy_unpack(Copy_Traits<CopyOp,Args...> const&,
+            Tensor<SEngine,SLayout>     const& src,
+            Tensor<DEngine,DLayout>          & dst)
 {
  // Specializations can generalize on these checks
-  //static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<Operation>");
-  //static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<Operation>");
+  //static_assert(is_smem<SEngine>::value, "Expected smem for this Copy_Traits<CopyOp>");
+  //static_assert(is_rmem<DEngine>::value, "Expected rmem for this Copy_Traits<CopyOp>");

-  using RegistersSrc = typename Operation::SRegisters;
-  using RegistersDst = typename Operation::DRegisters;
+  using RegistersSrc = typename CopyOp::SRegisters;
+  using RegistersDst = typename CopyOp::DRegisters;
  using RegTypeSrc   = typename remove_extent<RegistersSrc>::type;
  using RegTypeDst   = typename remove_extent<RegistersDst>::type;
  constexpr int RegNumSrc = extent<RegistersSrc>::value;
@@ -134,26 +135,26 @@ copy_unpack(Copy_Traits<Operation, Args...> const&,
  Tensor rD = recast<RegTypeDst>(dst);

  CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
-    "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
+    "Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp.");
  CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
-    "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
+    "Copy_Traits: dst failed to vectorize into registers. Layout is incompatible with this CopyOp.");

-  detail::copy_explode<Operation>(rS, make_int_sequence<RegNumSrc>{},
-                                  rD, make_int_sequence<RegNumDst>{});
+  detail::copy_explode<CopyOp>(rS, make_int_sequence<RegNumSrc>{},
+                               rD, make_int_sequence<RegNumDst>{});
 }

 //
 // Accept mutable temporaries
 //

-template <class Operation, class... Args,
-          class TS, class SLayout,
-          class TD, class DLayout>
+template <class CopyOp, class... Args,
+          class SEngine, class SLayout,
+          class DEngine, class DLayout>
 CUTE_HOST_DEVICE constexpr
 void
-copy_unpack(Copy_Traits<Operation, Args...> const& traits,
-            Tensor<TS,SLayout> const&  src,
-            Tensor<TD,DLayout>      && dst)
+copy_unpack(Copy_Traits<CopyOp,Args...> const& traits,
+            Tensor<SEngine,SLayout>     const& src,
+            Tensor<DEngine,DLayout>         && dst)
 {
  copy_unpack(traits, src, dst);
 }