v4.0 update. (#2371)

This commit is contained in:
Junkai-Wu
2025-06-06 14:39:20 +08:00
committed by GitHub
parent 2e2af190bd
commit 8bdbfca682
254 changed files with 29751 additions and 1980 deletions

View File

@ -33,7 +33,6 @@
#include <cute/config.hpp>
#include <cute/tensor_impl.hpp>
#include <cute/tensor_predicate.hpp>
namespace cute
{
@ -45,7 +44,7 @@ template <class Alpha,
class XEngine, class XLayout,
class Beta,
class YEngine, class YLayout,
class PrdTensor = TrivialPredTensor>
class PrdTensor = constant_fn<true_type>>
CUTE_HOST_DEVICE
void
axpby(Alpha const& alpha,
@ -64,7 +63,7 @@ template <class Alpha,
class XEngine, class XLayout,
class Beta,
class YEngine, class YLayout,
class PrdTensor = TrivialPredTensor>
class PrdTensor = constant_fn<true_type>>
CUTE_HOST_DEVICE
void
axpby(Alpha const& alpha,

View File

@ -36,7 +36,6 @@
#include <cute/swizzle.hpp> // cute::Swizzle
#include <cute/swizzle_layout.hpp> // cute::get_nonswizzle_portion
#include <cute/tensor_impl.hpp> // cute::Tensor
#include <cute/tensor_predicate.hpp>
#include <cute/algorithm/copy.hpp>
#include <cute/atom/copy_atom.hpp>

View File

@ -32,7 +32,6 @@
#include <cute/config.hpp> // CUTE_HOST_DEVICE
#include <cute/tensor_impl.hpp> // cute::Tensor
#include <cute/tensor_predicate.hpp> // cute::TrivialPredTensor
#include <cute/atom/copy_atom.hpp> // cute::Copy_Atom
namespace cute
@ -66,10 +65,45 @@ copy_if(PrdTensor const& pred,
// copy_if -- Predicated CopyAtom
//
// Predicate Tensor is an Actual Tensor
template <class... CopyArgs,
          class PrdEngine, class PrdLayout,
          class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
        Tensor<PrdEngine, PrdLayout> const& prd,     // ([V],Rest...)
        Tensor<SrcEngine, SrcLayout> const& src,     // ( V, Rest...)
        Tensor<DstEngine, DstLayout>      & dst)     // ( V, Rest...)
{
  if constexpr (PrdLayout::rank == SrcLayout::rank - 1) {
    // Back-compat ONLY -- Delete?
    // The predicate is missing the leading value mode: broadcast it by
    // prepending a size-1 stride-0 mode, then recurse on the matched-rank case.
    Tensor prd_full = make_tensor(prd.data(), prepend(prd.layout(), Layout<_1,_0>{}));
    copy_if(copy_atom, prd_full, src, dst);
  } else {
    static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch.");
    static_assert(SrcLayout::rank == PrdLayout::rank, "CopyAtom rank-mismatch.");

    if constexpr (SrcLayout::rank == 1) {
      // Only the value mode remains -- hand off directly to the atom.
      copy_atom.call(prd, src, dst);
    } else {
      // Collapse all non-value modes into a single mode and iterate over it.
      constexpr int NumModes = SrcLayout::rank;
      Tensor prd_flat = group_modes<1,NumModes>(prd);
      Tensor src_flat = group_modes<1,NumModes>(src);
      Tensor dst_flat = group_modes<1,NumModes>(dst);
      CUTE_UNROLL
      for (int rest = 0; rest < size<1>(dst_flat); ++rest) {
        copy_atom.call(prd_flat(_,rest), src_flat(_,rest), dst_flat(_,rest));
      }
    }
  }
}
template <class... CopyArgs,
class PredTensor,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
[[deprecated("Use a bool-tensor or transform-tensor as predication.")]]
CUTE_HOST_DEVICE
void
copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
@ -77,33 +111,14 @@ copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
Tensor<SrcEngine, SrcLayout> const& src, // (V,Rest...)
Tensor<DstEngine, DstLayout> & dst) // (V,Rest...)
{
static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch.");
auto has_with_bool = cute::is_valid([](auto t)->void_t<decltype(declval<typename decltype(t)::Traits>().with(true))>{}, copy_atom);
if constexpr (SrcLayout::rank == 1) { // Dispatch the copy
if constexpr (has_with_bool) {
copy_atom.with(pred()).call(src, dst);
} else {
if (pred()) { copy_atom.call(src, dst); }
}
} else { // Loop over all but the first mode
constexpr int R = SrcLayout::rank;
Tensor src_v = group_modes<1,R>(src);
Tensor dst_v = group_modes<1,R>(dst);
CUTE_UNROLL
for (int i = 0; i < size<1>(dst_v); ++i) {
if constexpr (has_with_bool) {
copy_atom.with(pred(i)).call(src_v(_,i), dst_v(_,i));
} else {
if (pred(i)) { copy_atom.call(src_v(_,i), dst_v(_,i)); }
}
}
}
Tensor tpred = cute::lazy::transform(make_tensor(counting_iterator<int>{}, replace<0>(shape(dst), _1{})), pred);
return copy_if(copy_atom, tpred, src, dst);
}
//
// copy_if -- AutoCopyAsync
//
template <class PrdTensor,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
@ -159,7 +174,7 @@ copy(AutoCopyAsync const& cpy,
Tensor<SrcEngine, SrcLayout> const& src, // (V,Rest...)
Tensor<DstEngine, DstLayout> & dst) // (V,Rest...)
{
copy_if(cpy, TrivialPredTensor{}, src, dst);
copy_if(cpy, constant_fn<true_type>{}, src, dst);
}
//
@ -202,7 +217,7 @@ copy(Copy_Atom<CopyArgs...> const& copy_atom,
Tensor dst_c = dst_n(make_coord(_,Int<0>{}),make_coord(Int<0>{},_)); // (V, Rest)
Tensor src_c = src_n(make_coord(_,Int<0>{}),make_coord(Int<0>{},_)); // (V, Rest)
CUTE_STATIC_ASSERT_V(size<1>(src_c) == size<1>(dst_c));
CUTE_STATIC_ASSERT_V( size<1>(src_c) == size<1>(dst_c));
CUTE_STATIC_ASSERT_V(shape<0>(dst_c) == shape<0>(dst));
CUTE_STATIC_ASSERT_V(shape<0>(src_c) == shape<0>(src));
@ -224,7 +239,7 @@ copy(Copy_Atom<CopyArgs...> const& copy_atom,
////////////////////////////////////////////////////////
// Specialization for AutoVectorizingCopyAssumedAlignment<MaxVecBits>
template <int MaxVecBits, class... Args,
template <int MaxVecBits,
class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
@ -234,23 +249,30 @@ copy(AutoVectorizingCopyWithAssumedAlignment<MaxVecBits> const&,
Tensor<DstEngine, DstLayout> & dst)
{
constexpr int common_elem = CUTE_STATIC_V(max_common_vector(src, dst));
constexpr int align_bits = CUTE_STATIC_V(gcd(max_alignment(src), max_alignment(dst), Int<MaxVecBits>{}));
static_assert(is_integral<decltype(Int<common_elem>{} * sizeof_bits_v<typename SrcEngine::value_type>)>::value, "Error: Attempting a subbit copy!");
constexpr int vec_bits = gcd(common_elem * sizeof_bits_v<typename SrcEngine::value_type>, align_bits);
static_assert(is_integral<decltype(Int<common_elem>{} * sizeof_bits_v<typename DstEngine::value_type>)>::value, "Error: Attempting a subbit write!");
if constexpr (common_elem > 1 && ((vec_bits % 8) == 0)) {
// If more than one element vectorizes to 8bits or more, then recast and copy
using VecType = uint_bit_t<vec_bits>;
// Preserve volatility
using SrcVecType = conditional_t<is_volatile_v<typename SrcEngine::element_type>, VecType const volatile, VecType const>;
using DstVecType = conditional_t<is_volatile_v<typename DstEngine::element_type>, VecType volatile, VecType >;
if constexpr (common_elem > 1)
{
constexpr int align_bits = CUTE_STATIC_V(gcd(max_alignment(src), max_alignment(dst), Int<MaxVecBits>{}));
constexpr int vec_bits = gcd(common_elem * sizeof_bits_v<typename SrcEngine::value_type>, align_bits);
// Recast
Tensor src_v = recast<SrcVecType>(src);
Tensor dst_v = recast<DstVecType>(dst);
return copy_if(TrivialPredTensor{}, src_v, dst_v);
if constexpr ((vec_bits % 8) == 0)
{
// If more than one element vectorizes to 8bits or more, then recast and copy
using VecType = uint_bit_t<vec_bits>;
// Preserve volatility
using SrcVecType = conditional_t<is_volatile_v<typename SrcEngine::element_type>, VecType const volatile, VecType const>;
using DstVecType = conditional_t<is_volatile_v<typename DstEngine::element_type>, VecType volatile, VecType >;
// Recast
Tensor src_v = recast<SrcVecType>(src);
Tensor dst_v = recast<DstVecType>(dst);
return copy_if(constant_fn<true_type>{}, src_v, dst_v);
} else {
return copy_if(constant_fn<true_type>{}, src, dst);
}
} else {
return copy_if(TrivialPredTensor{}, src, dst);
return copy_if(constant_fn<true_type>{}, src, dst);
}
}
@ -277,7 +299,7 @@ copy(AutoFilter<CopyOp> const& copy_op,
Tensor src_n = zipped_divide(src, dst_null);
CUTE_STATIC_ASSERT_V(cosize<0>(dst_n.layout()) == Int<1>{}, "Nullspace definition error");
CUTE_STATIC_ASSERT_V(cosize<0>(src_n.layout()) == Int<1>{}, "Error: Ambiguous scatter detected in copy");
CUTE_STATIC_ASSERT_V(cosize<0>(src_n.layout()) == Int<1>{}, "Error: Ambiguous race-condition detected.");
copy(copy_op.base, src_n(Int<0>{},_), dst_n(Int<0>{},_));
} else {
@ -335,6 +357,18 @@ copy(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>, Args...> con
return copy(AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>{}, src, dst);
}
// Forwarding overload: an atom whose traits wrap
// AutoVectorizingCopyWithAssumedAlignment<MaxVecBits> carries no state,
// so strip the atom and re-dispatch on the bare vectorizing tag.
template <int MaxVecBits, class... Args,
          class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy(Copy_Atom<Copy_Traits<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>>, Args...> const&,
     Tensor<SrcEngine, SrcLayout> const& src,
     Tensor<DstEngine, DstLayout>      & dst)
{
  AutoVectorizingCopyWithAssumedAlignment<MaxVecBits> vectorizing_tag{};
  return copy(vectorizing_tag, src, dst);
}
#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
template <class... CT_Args,
class SrcEngine, class SrcLayout,
@ -375,8 +409,8 @@ template <class... CT_Args, class... CA_Args,
CUTE_HOST_DEVICE
void
copy(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom,
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> & dst)
Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> & dst)
{
return copy(static_cast<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const&>(atom), src, dst);
}

View File

@ -90,18 +90,19 @@ constexpr bool has_prefetch<CopyOp, void_t<typename CopyOp::PREFETCH>> = true;
} // end namespace detail
template <class CopyOp, class... CT_Args, class... CA_Args,
template <class CopyOp, class... CT_Args, class CopyType,
class GEngine, class GLayout>
CUTE_HOST_DEVICE
void
prefetch(Copy_Atom<Copy_Traits<CopyOp, CT_Args...>, CA_Args...> const& atom,
Tensor<GEngine, GLayout> const& src)
prefetch(Copy_Atom<Copy_Traits<CopyOp, CT_Args...>, CopyType> const& atom,
Tensor<GEngine, GLayout> const& src)
{
if constexpr (detail::has_prefetch<CopyOp>) {
using Prefetch_Traits = Copy_Traits<typename CopyOp::PREFETCH, CT_Args...>;
using Prefetch_Atom = Copy_Atom<Prefetch_Traits, CA_Args...>;
using Prefetch_Atom = Copy_Atom<Prefetch_Traits, CopyType>;
Prefetch_Atom prefetch_atom{atom};
auto& dst = const_cast<Tensor<GEngine, GLayout>&>(src); // dst is ignored for prefetch atoms
//auto& dst = const_cast<Tensor<GEngine, GLayout>&>(src); // dst is ignored for prefetch atoms
Tensor dst = make_tensor(make_smem_ptr<CopyType>(nullptr), shape(src));
return copy(prefetch_atom, src, dst);
} else {
return prefetch(src);

View File

@ -163,4 +163,16 @@ transform(Tensor<EngineIn1,LayoutIn1> const& tensor_in1,
return transform(tensor_in1, tensor_in2, tensor_out, op);
}
namespace lazy {

// Element-wise transform without materialization: the returned tensor keeps
// @t's layout but views @t's data through a transform iterator, so @fn is
// applied on element access rather than eagerly over the whole tensor.
template <class Engine, class Layout, class Fn>
CUTE_HOST_DEVICE constexpr
auto
transform(cute::Tensor<Engine,Layout> const& t, Fn const& fn)
{
  auto mapped_iter = cute::make_transform_iter(fn, t.data());
  return cute::make_tensor(mapped_iter, t.layout());
}

} // end namespace lazy
} // end namespace cute

View File

@ -0,0 +1,107 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include <cute/config.hpp>
#include <cute/tensor_impl.hpp>
#include <cute/algorithm/functional.hpp>
#include <cute/algorithm/fill.hpp>
namespace cute
{
// Reduce @src tensor using binary reduction operator @op and initial value @init and return a scalar.
// Fold every element of @src into @init with @op (left fold, in linear
// index order) and return the resulting scalar.
template <class SrcEngine, class SrcLayout, class T, class BinaryOp = cute::plus>
CUTE_HOST_DEVICE constexpr
T
reduce(Tensor<SrcEngine,SrcLayout> const& src, T init, BinaryOp op = {})
{
  T acc = init;
  for (int idx = 0; idx < size(src); ++idx) {
    acc = op(acc, src(idx));
  }
  return acc;
}
// Reduce @src tensor RedMode using binary reduction operator @op and store the result in @dst tensor
// for each index in @dst/BatchMode.
// @pre @src tensor has rank 2
// @pre size of @src batch mode is equal to size of @dst batch mode
template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout,
          class BinaryOp = cute::plus>
CUTE_HOST_DEVICE constexpr
void
batch_reduce(Tensor<SrcEngine, SrcLayout> const& src,   // (RedMode, BatchMode)
             Tensor<DstEngine, DstLayout>      & dst,   // (BatchMode)
             BinaryOp op = {})
{
  // Preconditions: rank-2 source; batch extents must agree.
  CUTE_STATIC_ASSERT_V(rank(src) == Int<2>{});
  assert(size<1>(src) == size(dst));

  // Reduce each batch slice, seeding the fold with dst's current value
  // so repeated calls accumulate into dst.
  for (int batch = 0; batch < size(dst); ++batch) {
    dst(batch) = reduce(src(_,batch), dst(batch), op);
  }
}
// Reduce @src tensor along selected modes specified in @target_profile using binary reduction operator @op
// and store the result in @dst tensor. @target_profile is a tuple where '_' indicates modes to keep and
// integers indicates modes to reduce.
// @pre @target_profile is compatible with @src layout
template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout,
          class TargetProfile,
          class BinaryOp = cute::plus>
CUTE_HOST_DEVICE constexpr
void
logical_reduce(Tensor<SrcEngine, SrcLayout> const& src,
               Tensor<DstEngine, DstLayout>      & dst,
               TargetProfile const& target_profile,
               BinaryOp op = {})
{
  // The profile must cover src's shape: '_' marks modes to keep,
  // integers mark modes to reduce away.
  assert(compatible(target_profile, shape(src)));

  // Dice out the reduced modes and slice out the kept (batch) modes.
  auto reduced_modes = dice(target_profile, src.layout());
  auto kept_modes    = slice(target_profile, src.layout());

  // Promote a rank-0 result to a trivial 1:0 layout so the rank-2
  // (RedMode, BatchMode) view below is always well-formed.
  auto red_mode   = conditional_return<rank(reduced_modes) == Int<0>{}>(Layout<_1,_0>{}, reduced_modes);
  auto batch_mode = conditional_return<rank(kept_modes)    == Int<0>{}>(Layout<_1,_0>{}, kept_modes);

  // View src as (RedMode, BatchMode) and delegate to batch_reduce.
  auto src_2d = make_tensor(src.data(), make_layout(red_mode, batch_mode));
  batch_reduce(src_2d, dst, op);
}
} // end namespace cute