diff --git a/CHANGELOG.md b/CHANGELOG.md index c0a7ad8e..87f01574 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ * Sub-Byte type fixes and improvements. * EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. * Fusion support for backprop fusions including drelu, dgelu, and dbias. -* Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface. +* Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface ## [3.2.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-10-25) * Minor patch for issue/1138 diff --git a/README.md b/README.md index 36b0d151..4c43f1b9 100644 --- a/README.md +++ b/README.md @@ -50,8 +50,8 @@ CUTLASS 3.3.0 is an update to CUTLASS adding: - New [Copy Async based Hopper GEMMs](/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu) - which support lower than 16B aligned input tensors (across s8/fp8/fp16/bf16/tf32 types) with optimal performance. As a part of this, new kernel schedules, and Copy Ops [SM80\_CP\_ASYNC\_CACHE\_\*](/include/cute/arch/copy_sm80.hpp) were also added. - EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. - Various subbyte enhancements like tagged device ptrs, support for vectorized copy, various operators to treat subbyte iterators as pointers, and full-fledged CuTe Tensor support. -- Support for Clang as a host compiler. -- Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface. +- Support for Clang as a host compiler. +- Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface Minimum requirements: diff --git a/include/cute/algorithm/copy.hpp b/include/cute/algorithm/copy.hpp index 9506db79..8171dcad 100644 --- a/include/cute/algorithm/copy.hpp +++ b/include/cute/algorithm/copy.hpp @@ -32,6 +32,8 @@ #include +#include + #include #include @@ -44,30 +46,14 @@ namespace cute // Accept mutable temporaries // -template CUTE_HOST_DEVICE void -copy_if(PrdTensor const& pred, - Tensor const& src, - Tensor && dst) +copy(Tensor const& src, + Tensor && dst) { - return copy_if(pred, src, dst); -} - -template -CUTE_HOST_DEVICE -void -copy_if(Copy_Atom const& copy_atom, - PrdTensor const& pred, - Tensor const& src, - Tensor && dst) -{ - return copy_if(copy_atom, pred, src, dst); + return copy(src, dst); } template CUTE_HOST_DEVICE void -copy(Tensor const& src, - Tensor && dst) +copy_aligned(Tensor const& src, + Tensor && dst) { - return copy(src, dst); + return copy_aligned(src, dst); } -template CUTE_HOST_DEVICE void -copy(Copy_Atom const& copy_atom, +copy_if(PrdTensor const& pred, + Tensor const& src, + Tensor && dst) +{ + return copy_if(pred, src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy_if(CopyPolicy const& copy_policy, + PrdTensor const& pred, + Tensor const& src, + Tensor && dst) +{ + return copy_if(copy_policy, pred, src, dst); +} + +template +CUTE_HOST_DEVICE +void +copy(CopyPolicy const& copy_policy, Tensor const& src, Tensor && dst) { - return copy(copy_atom, src, dst); + return copy(copy_policy, src, dst); } // @@ -135,7 +147,7 @@ namespace detail { // Trait that detects if atom's traits has a member function with(bool) template constexpr bool has_with_bool = false; - + template constexpr bool has_with_bool().with(declval()))>> = true; @@ -157,15 +169,14 @@ copy_if(Copy_Atom const& copy_atom, copy_atom.call(src, dst); } else { // Loop over all but the first mode constexpr int R = SrcLayout::rank; - auto src_v = group_modes<1,R>(src); - auto dst_v = group_modes<1,R>(dst); + Tensor src_v = group_modes<1,R>(src); + Tensor dst_v = group_modes<1,R>(dst); CUTE_UNROLL for (int i = 0; i < size<1>(src_v); ++i) { // If copy traits can be transformed with a predicate value, do it, otherwise branch here if constexpr (detail::has_with_bool>) { copy_atom.with(pred(i)).call(src_v(_,i), dst_v(_,i)); - } - else { + } else { if (pred(i)) { copy_atom.call(src_v(_,i), dst_v(_,i)); } @@ -186,23 +197,24 @@ void copy_vec(Tensor const& src, Tensor & dst) { + static_assert(sizeof_bits_v >= 8 && sizeof_bits_v % 8 == 0, + "Expected a vectorization type of at least a byte."); using SrcType = typename SrcEngine::element_type; using DstType = typename DstEngine::element_type; - if constexpr (sizeof(SrcType) == sizeof(DstType) && sizeof(VecType) > sizeof(DstType)) + if constexpr (sizeof_bits_v == sizeof_bits_v && + sizeof_bits_v > sizeof_bits_v) { - /* @pre is_aligned(src.data()) && - * is_aligned(dst.data()) - */ + // Preserve volatility of Src/Dst types. using SrcVecType = conditional_t, VecType const volatile, VecType const>; using DstVecType = conditional_t, VecType volatile, VecType >; - auto src_v = recast(src); - auto dst_v = recast(dst); + Tensor src_v = recast(src); + Tensor dst_v = recast(dst); #if 0 if (thread0()) { - print("copy_vec -- vectorizing copy from %3db to %3db\n", int(8*sizeof(SrcType)), int(8*sizeof(VecType))); - print(" "); print(layout(src)); print(" => "); print(layout(src_v)); print("\n"); - print(" "); print(layout(dst)); print(" => "); print(layout(dst_v)); print("\n"); + print("copy_vec<%db> -- vectorizing copy:\n", int(sizeof_bits_v)); + print(" "); print(src); print(" => "); print(src_v); print("\n"); + print(" "); print(dst); print(" => "); print(dst_v); print("\n"); } #endif @@ -210,9 +222,9 @@ copy_vec(Tensor const& src, } else { #if 0 if (thread0()) { - print("copy_vec -- not vectorizing, copy with %3db and %3db\n", int(8*sizeof(SrcType)), int(8*sizeof(DstType))); - print(" "); print(layout(src)); print("\n"); - print(" "); print(layout(dst)); print("\n"); + print("copy_vec<%db> -- NOT vectorizing copy:\n", int(sizeof_bits_v)); + print(" "); print(src); print("\n"); + print(" "); print(dst); print("\n"); } #endif @@ -220,36 +232,6 @@ copy_vec(Tensor const& src, } } -// -// copy -- auto-vectorizing copy -// - -template -CUTE_HOST_DEVICE -void -copy(Tensor const& src, - Tensor & dst) -{ - constexpr int N = decltype(max_common_vector(src, dst))::value; - -#if 0 - if (thread0()) { - print("copy -- found a max_common_vector of %d\n", N); - print(" "); print(src.data()); print(" o "); print(layout(src)); print("\n"); - print(" "); print(dst.data()); print(" o "); print(layout(dst)); print("\n"); - } -#endif - - if constexpr (N <= 1) { - return copy_if(TrivialPredTensor{}, src, dst); - } else { - constexpr int vec_bits = N * sizeof_bits::value; - using VecType = uint_bit_t; - return copy_vec(src, dst); - } -} - // // copy -- CopyAtom // @@ -266,23 +248,135 @@ copy(Copy_Atom const& copy_atom, return copy_if(copy_atom, TrivialPredTensor{}, src, dst); } -template -CUTE_HOST_DEVICE -void -copy(Copy_Atom const&, - Tensor const& src, - Tensor & dst) -{ - return copy(src, dst); -} - ////////////////////////////////////////// // Special Auto-Vectorizing Overloads ////////////////////////////////////////// +// Specialization for AutoVectorizingCopyAssumedAlignment +template +CUTE_HOST_DEVICE +void +copy(AutoVectorizingCopyWithAssumedAlignment const&, + Tensor const& src, + Tensor & dst) +{ + constexpr int vec_elem = decltype(max_common_vector(src, dst))::value; + + constexpr int src_bits = sizeof_bits::value; + // When layouts are static, accept vec_bits up to 128 + // When layouts are dynamic, accept vec_bits up to MaxVecBits + constexpr int vec_bits = (is_static::value && is_static::value) ? + cute::min(vec_elem * src_bits, 128) : + cute::min(vec_elem * src_bits, MaxVecBits); + +#if 0 + if (thread0()) { + print("copy -- found max_common_vector of %d elems and vectorization to %d bits\n", vec_elem, vec_bits); + print(" "); print(src); print("\n"); + print(" "); print(dst); print("\n"); + } +#endif + + if constexpr (vec_elem > 1 && vec_bits >= 8) { + return copy_vec>(src, dst); + } else { + return copy_if(TrivialPredTensor{}, src, dst); + } +} + +// Auto-vectorizing copy for static layouts +template +CUTE_HOST_DEVICE +void +copy(Tensor const& src, + Tensor & dst) +{ + return copy(AutoVectorizingCopy{}, src, dst); +} + +// Auto-vectorizing copy with assumed alignment of dynamic layout strides up to 128bit. +template +CUTE_HOST_DEVICE +void +copy_aligned(Tensor const& src, + Tensor & dst) +{ + return copy(AutoVectorizingCopyWithAssumedAlignment<128>{}, src, dst); +} + +// Specializaton for Atom AutoVectorizingCopy +template +CUTE_HOST_DEVICE +void +copy(Copy_Atom const&, + Tensor const& src, + Tensor & dst) +{ + return copy(AutoVectorizingCopy{}, src, dst); +} + +// Specializaton for Atom AutoVectorizingCopyAssumedAlignment +template +CUTE_HOST_DEVICE +void +copy(Copy_Atom, Args...> const&, + Tensor const& src, + Tensor & dst) +{ + return copy(AutoVectorizingCopyWithAssumedAlignment{}, src, dst); +} + #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) +template +CUTE_HOST_DEVICE +void +copy(Copy_Traits const& atom, // Copy_Traits may or may not have the memory barrier in it already + Tensor const& src, + Tensor & dst) +{ + using SrcType = typename SrcEngine::value_type; + using DstType = typename DstEngine::value_type; + static_assert(sizeof_bits::value == sizeof_bits::value); + static_assert((is_gmem::value && is_smem::value) || + (is_smem::value && is_gmem::value), + "Bulk Copy only supports gmem -> smem or smem -> gmem movement."); + // G2S or S2G dispatch + using BULK_COPY_OP = conditional_t::value, + SM90_BULK_COPY_G2S, + SM90_BULK_COPY_S2G>; + + // Find the common subtensor of src and dst + auto tiler = max_common_layout(src, dst); + constexpr int vec_elem = decltype(size(tiler))::value; + constexpr int vec_bits = vec_elem * sizeof_bits_v; + static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP"); + + // Construct a new concrete Atom of the vector size + using BulkAtom = Copy_Atom, CT_Args...>, SrcType>; + auto bulk_atom = apply(atom.opargs_, [](auto const&... args) { return BulkAtom{args...}; }); + +#if 0 + if (thread0()) { + print("copy blkcp -- found a max_common_layout of "); print(tiler); print("\n"); + print(" "); print(src); print("\n"); + print(" "); print(dst); print("\n"); + } +#endif + + return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler)); +} + +// Backwards-compat. Throw out any extra Copy_Atom args. template @@ -292,36 +386,7 @@ copy(Copy_Atom, CA_Args...> const& Tensor const& src, Tensor & dst) { - using SrcType = typename SrcEngine::value_type; - using DstType = typename DstEngine::value_type; - static_assert(sizeof_bits::value == sizeof_bits::value); - static_assert((is_gmem::value && is_smem::value) || - (is_smem::value && is_gmem::value), - "Bulk Copy only supports gmem -> smem or smem -> gmem movement."); - // Do BulkCopy dispatch - using BULK_COPY_OP = conditional_t::value, - SM90_BULK_COPY_G2S, - SM90_BULK_COPY_S2G>; - - constexpr int N = decltype(max_common_vector(src, dst))::value; - - // Construct a new concrete Atom of the vector size - using N_BITS = Int::value>; - using COPY_ATOM = Copy_Atom, SrcType>; - auto bulk_atom = apply(atom.opargs_, [&](auto const&... args) { return COPY_ATOM{args...}; }); - - // Tile the src and dst to the Atom - auto tiler = right_inverse(dst.layout()).compose(Int{}); - -#if 0 - if (thread0()) { - print("copy -- found a max_common_vector of %d\n", N); - print(" "); print(src.data()); print(" o "); print(layout(src)); print("\n"); - print(" "); print(dst.data()); print(" o "); print(layout(dst)); print("\n"); - } -#endif - - return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler)); + return copy(static_cast const&>(atom), src, dst); } #endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) diff --git a/include/cute/arch/copy.hpp b/include/cute/arch/copy.hpp index 8c2552ec..61773599 100644 --- a/include/cute/arch/copy.hpp +++ b/include/cute/arch/copy.hpp @@ -33,7 +33,7 @@ #include #include -#include +#include namespace cute { @@ -48,7 +48,7 @@ struct UniversalCopy using SRegisters = S[1]; using DRegisters = D[1]; - template + template CUTE_HOST_DEVICE static constexpr void copy(S_ const& src, D_ & dst) @@ -57,25 +57,36 @@ struct UniversalCopy } // Accept mutable temporaries - template + template CUTE_HOST_DEVICE static constexpr void copy(S_ const& src, D_ && dst) { - copy(src, dst); + UniversalCopy::copy(src, dst); } }; // -// Placeholder for the copy algorithm's default, auto-vectorizing behavior +// Placeholder for the copy algorithm's stronger auto-vectorizing behavior +// that assumes alignment of dynamic layouts up to MaxVecBits // -struct DefaultCopy +template +struct AutoVectorizingCopyWithAssumedAlignment + : UniversalCopy> { - using SRegisters = uint128_t[1]; - using DRegisters = uint128_t[1]; + static_assert(MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128, + "Expected MaxVecBits to be 8 or 16 or 32 or 64 or 128 for alignment and performance."); }; -using AutoVectorizingCopy = DefaultCopy; +// +// Placeholder for the copy algorithm's default auto-vectorizing behavior +// that does not assume alignment of dynamic layouts +// + +using AutoVectorizingCopy = AutoVectorizingCopyWithAssumedAlignment<8>; + +// Alias +using DefaultCopy = AutoVectorizingCopy; } // end namespace cute diff --git a/include/cute/atom/copy_atom.hpp b/include/cute/atom/copy_atom.hpp index 098aa8ca..de320a22 100644 --- a/include/cute/atom/copy_atom.hpp +++ b/include/cute/atom/copy_atom.hpp @@ -353,7 +353,7 @@ struct ThrCopy template CUTE_HOST_DEVICE auto - partition_S(STensor&& stensor) { + partition_S(STensor&& stensor) const { //static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename TiledCopy::ValType), // "Expected ValType for tiling SrcTensor."); auto thr_tensor = make_tensor(std::forward(stensor).data(), TiledCopy::tidfrg_S(stensor.layout())); @@ -363,7 +363,7 @@ struct ThrCopy template CUTE_HOST_DEVICE auto - partition_D(DTensor&& dtensor) { + partition_D(DTensor&& dtensor) const { //static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename TiledCopy::ValType), // "Expected ValType for tiling DstTensor."); auto thr_tensor = make_tensor(std::forward(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout())); @@ -479,10 +479,10 @@ make_tiled_copy_C_atom(Copy_Atom const& copy_atom, return make_tiled_copy_impl(copy_atom, layout_tv, tiler); } -/** Produce a TiledCopy from logical thread and values layouts. - * The thread and value layouts map coordinates to thr_idx and val_idx. +/** Produce a TiledCopy from logical thread and values layouts. + * The thread and value layouts map coordinates to thr_idx and val_idx. * The product of these layouts is taken to produce the TV layout and the Tiler. - * Useful when threads and values need very specific mappings onto coordinates + * Useful when threads and values need very specific mappings onto coordinates * in the target tensors. */ template const& copy_atom, return make_tiled_copy_impl(copy_atom, layout_tv, product_each(shape(layout_mn))); } -/** Produce a TiledCopy from thread and value offset maps. +/** Produce a TiledCopy from thread and value offset maps. * The TV Layout maps threads and values to the codomain of the data_layout. - * It is verified that the intended codomain is valid within data_layout. + * It is verified that the intended codomain is valid within data_layout. * Useful when threads and values don't care about owning specific coordinates, but * care more about the vector-width and offsets between them. */ template CUTE_HOST_DEVICE constexpr auto -make_cotiled_copy(Copy_Atom const& copy_atom, +make_cotiled_copy(Copy_Atom const& copy_atom, AtomTVLayout const& atom_tv_layout, // atom (thr,val) -> data addr DataLayout const& data_layout) // coord -> data addr The target layout { diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp index 53548f3a..14598f0f 100644 --- a/include/cute/atom/copy_traits.hpp +++ b/include/cute/atom/copy_traits.hpp @@ -59,7 +59,7 @@ namespace cute template struct Copy_Traits { - static_assert(sizeof(CopyOperation) == 0, "Copy_Traits not implemented for this Copy_Operation."); + static_assert(dependent_false, "Copy_Traits not implemented for this CopyOperation."); }; template @@ -77,8 +77,8 @@ struct Copy_Traits> using RefLayout = SrcLayout; }; -template <> -struct Copy_Traits +template +struct Copy_Traits> { // Logical thread id to thread idx (one-thread) using ThrID = Layout<_1>; @@ -108,23 +108,24 @@ copy_explode(PtrS&& s, int_sequence, } // end namespace detail // -// Generic copy_unpack for any Copy_Traits +// Generic copy_unpack for common argument-based Copy_Traits // -template + +template CUTE_HOST_DEVICE constexpr void -copy_unpack(Copy_Traits const&, - Tensor const& src, - Tensor & dst) +copy_unpack(Copy_Traits const&, + Tensor const& src, + Tensor & dst) { // Specializations can generalize on these checks - //static_assert(is_smem::value, "Expected smem for this Copy_Traits"); - //static_assert(is_rmem::value, "Expected rmem for this Copy_Traits"); + //static_assert(is_smem::value, "Expected smem for this Copy_Traits"); + //static_assert(is_rmem::value, "Expected rmem for this Copy_Traits"); - using RegistersSrc = typename Operation::SRegisters; - using RegistersDst = typename Operation::DRegisters; + using RegistersSrc = typename CopyOp::SRegisters; + using RegistersDst = typename CopyOp::DRegisters; using RegTypeSrc = typename remove_extent::type; using RegTypeDst = typename remove_extent::type; constexpr int RegNumSrc = extent::value; @@ -134,26 +135,26 @@ copy_unpack(Copy_Traits const&, Tensor rD = recast(dst); CUTE_STATIC_ASSERT_V(size(rS) == Int{}, - "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy."); + "Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp."); CUTE_STATIC_ASSERT_V(size(rD) == Int{}, - "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy."); + "Copy_Traits: dst failed to vectorize into registers. Layout is incompatible with this CopyOp."); - detail::copy_explode(rS, make_int_sequence{}, - rD, make_int_sequence{}); + detail::copy_explode(rS, make_int_sequence{}, + rD, make_int_sequence{}); } // // Accept mutable temporaries // -template +template CUTE_HOST_DEVICE constexpr void -copy_unpack(Copy_Traits const& traits, - Tensor const& src, - Tensor && dst) +copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor && dst) { copy_unpack(traits, src, dst); } diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp index d901ac5e..fb30e4f3 100644 --- a/include/cute/layout.hpp +++ b/include/cute/layout.hpp @@ -1184,11 +1184,10 @@ left_inverse(Underscore const& _) // /* Return a layout that points to the maximum number of contiguous elements - * that logically correspond in the layouts of @a a and @a b. This is, - * the elements that could reasonably be "vectorized" in the layouts. + * that logically correspond in the layouts of @a a and @a b. * * @returns Layout R - * @post For all 0 <= i < size(R), a(R(i)) == i && b(R(i)) == i + * @post For all 0 <= i < size(R), a(R(i)) == i and b(R(i)) == i */ template @@ -1200,8 +1199,7 @@ max_common_layout(Layout const& a, Layout inv_b = right_inverse(b); Layout common = coalesce(composition(a, inv_b)); - // NOTE: If one of the layouts is dynamic, we can't prove alignment+vectorization is valid - // We assume dynamic shapes/strides obey alignment requirements (i.e. are large and multiples of the vector) + // Keep only the static identity component of the common layout if constexpr (is_static(common))>::value && is_constant<1, decltype(stride<0>(common))>::value) { // Truncate to the size of the contiguous vector (static stride-1 mode) @@ -1212,11 +1210,11 @@ max_common_layout(Layout const& a, } /* Return Int such that N is the maximum number of contiguous elements - * that logically correspond in the layouts of @a a and @a b. This is, - * the number of elements that could reasonably be "vectorized" in the layouts. + * that logically correspond in the layouts of @a a and @a b. * * @returns Int with N >= 1 - * @post For all 0 <= n < N, a(b[n]) == n (NOTE: Problems with negative strides/coords in this post-condition) + * @post For all 0 <= n < N, a(b.get_1d_coord(n)) == n + * (NOTE: Problems with negative strides/coords in this post-condition) */ template @@ -1227,8 +1225,7 @@ max_common_vector(Layout const& a, { Layout common = coalesce(composition(a, right_inverse(b))); - // NOTE: If one of the layouts is dynamic, we can't prove alignment+vectorization is valid - // We assume dynamic shapes/strides obey alignment requirements (i.e. are large and multiples of the vector) + // Keep only the static identity component of the common layout if constexpr (is_static(common))>::value && is_constant<1, decltype(stride<0>(common))>::value) { // Truncate to the size of the contiguous vector (static stride-1 mode) diff --git a/include/cute/swizzle_layout.hpp b/include/cute/swizzle_layout.hpp index 1bbccd0c..164961b2 100644 --- a/include/cute/swizzle_layout.hpp +++ b/include/cute/swizzle_layout.hpp @@ -449,6 +449,30 @@ recast_layout(Swizzle const& swizzle) // Other operations // +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(ComposedLayout,Offset,LayoutB> const& a, + Layout const& b) +{ + auto common = max_common_layout(a.layout_b(), b); + auto base = Int<(1 << M)>{}; + if constexpr (base < size(common)) { + return common.compose(base); // Truncate common to size base + } else { + return common; + } +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(Layout const& a, + ComposedLayout,Offset,LayoutB> const& b) +{ + return max_common_layout(b, a); +} + template CUTE_HOST_DEVICE constexpr auto diff --git a/include/cute/tensor.hpp b/include/cute/tensor.hpp index 9b4e744c..c25ce1d2 100644 --- a/include/cute/tensor.hpp +++ b/include/cute/tensor.hpp @@ -674,7 +674,7 @@ recast(Tensor&& tensor) // max_common_vector // -/* Return Int such that N is the maximum number of continguous elements +/* Return Int such that N is the maximum number of contiguous elements * that logically correspond in the tensors of @a a and @a b. This is, * the number of elements that could reasonably be vectorized into a single load/store. * @@ -682,6 +682,9 @@ recast(Tensor&& tensor) * * A return value of Int<0> indicates that no such conclusion can be made and no * vectorization should be attempted. + * + * Note that the return value does NOT include alignment concerns such as the pointer value and + * the divisbility of dynamic strides. */ template @@ -713,6 +716,46 @@ max_common_vector(Tensor const& a, CUTE_GCC_UNREACHABLE; } +/* Return a layout that points to the maximum number of contiguous elements + * that logically correspond in the tensors of @a a and @a b. This is, + * the elements that could reasonably be "vectorized" into a single load/store. + * + * @returns Layout R such that composition(a.layout(), R) and composition(b.layout(), R) + * are both identity Layouts. + * + * Note that the returned layout does NOT include alignment concerns such as the pointer value and + * the divisbility of dynamic strides. + */ +template +CUTE_HOST_DEVICE constexpr +auto +max_common_layout(Tensor const& a, + Tensor const& b) +{ + using SrcType = typename Tensor::value_type; + using DstType = typename Tensor::value_type; + using SrcRef = typename Tensor::reference; + using DstRef = typename Tensor::reference; + + // Determine if vectorization candidates at all + if constexpr (// Should be the same value_types, else the copy is also performing a cast + sizeof_bits_v == sizeof_bits_v && + // The types should be trivially copyable so that vectorization is valid + is_trivially_copyable::value && + is_trivially_copyable::value && + // Should be load/storing real data, rather than implicit iterators or such + is_reference::value && + is_reference::value) + { + return max_common_layout(a.layout(), b.layout()); + } else { + return Layout<_1,_0>{}; + } + + CUTE_GCC_UNREACHABLE; +} + // // Key algebraic operations -- Divide and Product // diff --git a/include/cutlass/cluster_launch.hpp b/include/cutlass/cluster_launch.hpp index 995030a7..28611d51 100644 --- a/include/cutlass/cluster_launch.hpp +++ b/include/cutlass/cluster_launch.hpp @@ -42,8 +42,8 @@ #if defined(__CUDACC_RTC__) #include #else -#include #include +#include #endif #if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl index c0401f6d..8580547b 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -154,7 +154,7 @@ sm90_get_smem_store_op_for_accumulator() { } else { // auto-vectorizing store - return DefaultCopy{}; + return AutoVectorizingCopyWithAssumedAlignment{}; } } @@ -175,7 +175,7 @@ sm90_get_smem_load_op_for_source() { } else { // auto-vectorizing load - return DefaultCopy{}; + return AutoVectorizingCopyWithAssumedAlignment{}; } } diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp index 20e4118f..687cb293 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp @@ -438,7 +438,7 @@ struct Sm90ReLUAuxStore { using VecType = uint_bit_t; Tensor tC_rAux_vec = recast(tC_rAux); Tensor tC_gAux_vec = recast(tC_gAux); - Tensor tC_cAux_vec = tC_cAux.compose(make_layout(Int{}, Int{})); + Tensor tC_cAux_vec = tC_cAux.compose(make_layout(Int{}, Int{})); // only works if vector is logically sequential auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_mn); }; copy_if(FunctionPredTensor(predicate_fn), tC_rAux_vec, tC_gAux_vec); } @@ -662,7 +662,7 @@ struct Sm90AuxLoad< } if (elem_less(repeat_like(residue_mn, _0{}), residue_mn)) { // (partially) in-bounds CTA tile - copy(tC_gAux, tC_rAux); + copy_aligned(tC_gAux, tC_rAux); } } } @@ -677,7 +677,7 @@ struct Sm90AuxLoad< } if (elem_less(repeat_like(residue_mn, _0{}), residue_mn)) { - copy(tC_gAux(_,_,_,epi_m,epi_n), tC_rAux); + copy_aligned(tC_gAux(_,_,_,epi_m,epi_n), tC_rAux); } } } diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp index e615447d..b60dc2c7 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp @@ -641,8 +641,9 @@ struct Sm90RowBroadcast { if (epi_m == 0) { // Assumes M-major subtile loop // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) int bcast_pipe_index = (load_iteration / EpiTiles) % Stages; - copy(filter(tCsRow(_,_,_,epi_m,epi_n,bcast_pipe_index)), filter(tCrRow)); + copy_aligned(filter(tCsRow(_,_,_,epi_m,epi_n,bcast_pipe_index)), filter(tCrRow)); } } @@ -774,7 +775,8 @@ struct Sm90ColBroadcast { } // Filter so we don't issue redundant copies over stride-0 modes - copy(filter(tCgCol), filter(tCrCol)); + // (only works if 0-strides are in same location, which is by construction) + copy_aligned(filter(tCgCol), filter(tCrCol)); } template diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp index 374309ee..2330c30b 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp @@ -915,8 +915,9 @@ public: using ElementGmem = conditional_t; Tensor tCgBuf = sm90_partition_for_epilogue(gBuf_nl(_,_,n,l), epi_tile, tiled_copy, thread_idx); if (is_reduced_lane) { - // Filter so we don't issue redunant copies over stride-0 modes - copy(filter(tCrCol), recast(filter(tCgBuf))); + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_aligned(filter(tCrCol), recast(filter(tCgBuf))); } sync_fn(); } @@ -934,7 +935,8 @@ public: Tensor tCsBuf = sm90_partition_for_epilogue(sBuf(_,_,get<1>(warp_mn)), epi_tile, tiled_copy, thread_idx); if (is_reduced_lane) { // Filter so we don't issue redunant copies over stride-0 modes - copy(filter(tCrCol), filter(tCsBuf)); + // (only works if 0-strides are in same location, which is by construction) + copy_aligned(filter(tCrCol), filter(tCsBuf)); } sync_fn(); diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index def07421..7f1c242d 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -231,7 +231,7 @@ int ceil_div(int a, int b) { * log2_up/down codes? */ template -CUTLASS_HOST_DEVICE int clz(value_t x) { +CUTLASS_HOST_DEVICE value_t clz(value_t x) { for (int i = 31; i >= 0; --i) { if ((1 << i) & x) return value_t(31 - i); diff --git a/include/cutlass/float8.h b/include/cutlass/float8.h index f7ec68e8..100c7a3e 100644 --- a/include/cutlass/float8.h +++ b/include/cutlass/float8.h @@ -1,4 +1,4 @@ -/************************************************************************************************** +/*************************************************************************************************** * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * @@ -217,7 +217,7 @@ struct alignas(1) float8_base { // Extract the bits in the FP32 type uint8_t sign = uint8_t((s >> 24 & 0x80)); int32_t exp = int32_t((s >> FP32_NUM_MANTISSA_BITS) & 0xff) - FP32_EXPONENT_BIAS; - uint32_t mantissa = s & 0x7fffff; + int mantissa = s & 0x7fffff; uint8_t u = 0; uint8_t const kF8_NaN = 0x7f; diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index 5e69ffb5..265eedfd 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -375,6 +375,7 @@ public: } }; + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Static initializers ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp index ec278da8..584aa58e 100644 --- a/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp +++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp @@ -347,7 +347,7 @@ public: // The number of tiles for which reduction is required is either: // (a) the total number of output tiles (in the case of split-K) // (b) the number of stream-K tiles - // To calcualte the total number of output tiles in the split-K case, we + // To calculate the total number of output tiles in the split-K case, we // note that, in the split-K case, the units_per_problem_ member of Params will be // the total number of output tiles. auto reduction_tiles = params.splits_ > 1 ? params.units_per_problem_ : params.sk_tiles_; diff --git a/pyproject.toml b/pyproject.toml index 03d38dcd..9b9224b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ dependencies = [ "networkx", "numpy", "pydot", - "rmm-cu12 ; python_version>='3.9'", "scipy", "treelib" ] diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py index 20a2181a..6efc7659 100644 --- a/python/cutlass_library/manifest.py +++ b/python/cutlass_library/manifest.py @@ -181,9 +181,8 @@ void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Ma """ self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" self.configuration_template = " initialize_${configuration_name}(manifest);\n" - self.subclass_prototype_template = "void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n" self.subclass_call_template = " initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(manifest);\n" - + self.subclass_prototype_template = "void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n" self.epilogue_template ="""} /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -251,7 +250,6 @@ void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Ma # def __exit__(self, exception_type, exception_value, traceback): - for subclass_name, subclass_file in sorted(self.subclass_files.items()): subclass_cfg = { 'min_cc': str(self.min_cc), diff --git a/setup.cfg b/setup.cfg index 78222b8c..c996eed4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,4 +30,4 @@ include_package_data = True cutlass_library.source = include/**/*, examples/**/*, tools/**/* [options.exclude_package_data] -cutlass_library.source = include/**/*.py, examples/**/*.py, tools/**/*.py \ No newline at end of file +cutlass_library.source = include/**/*.py, examples/**/*.py, tools/**/*.py diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h index 7054ce98..61f6ff73 100644 --- a/test/unit/conv/device/conv2d_testbed.h +++ b/test/unit/conv/device/conv2d_testbed.h @@ -742,7 +742,7 @@ bool TestAllConv2d( } // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep + // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index 3093525a..fe57ec85 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -609,7 +609,7 @@ bool TestAllInterleavedConv2d( #if 0 // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep + // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h index dc2297fc..d1a1e666 100644 --- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h +++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h @@ -632,7 +632,7 @@ bool TestAllConv2dWithBroadcast( // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep + // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h index 7973870a..7c573f06 100644 --- a/test/unit/conv/device/conv2d_with_reduction_testbed.h +++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h @@ -587,7 +587,7 @@ bool TestAllConv2dWithReduction( // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep + // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( {1, 17, 11, 288}, // input size (NHWC) diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index 577c84f3..00c2eb1f 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -613,7 +613,7 @@ bool TestAllConv3d( // Sweep split-k-slice using serial reduction with non-unity alpha and non-zero beta for // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters - // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep + // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep // alpha and beta for local testing, but only runs one value for alpha and beta. cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size ( {1, 8, 8, 8, 32}, // input size (NDHWC) diff --git a/test/unit/cute/CMakeLists.txt b/test/unit/cute/CMakeLists.txt index 16e5df3b..855d3162 100644 --- a/test/unit/cute/CMakeLists.txt +++ b/test/unit/cute/CMakeLists.txt @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_subdirectory(core) +add_subdirectory(volta) add_subdirectory(ampere) add_subdirectory(hopper) add_subdirectory(layout) @@ -37,6 +38,7 @@ add_custom_target( DEPENDS cutlass_test_unit_cute_layout cutlass_test_unit_cute_core + cutlass_test_unit_cute_volta cutlass_test_unit_cute_ampere cutlass_test_unit_cute_hopper cutlass_test_unit_cute_msvc_compilation @@ -47,6 +49,7 @@ add_custom_target( DEPENDS test_unit_cute_layout test_unit_cute_core + test_unit_cute_volta test_unit_cute_ampere test_unit_cute_hopper test_unit_cute_msvc_compilation diff --git a/test/unit/cute/hopper/bulk_load.cu b/test/unit/cute/hopper/bulk_load.cu index 7f93c29f..ab8b270e 100644 --- a/test/unit/cute/hopper/bulk_load.cu +++ b/test/unit/cute/hopper/bulk_load.cu @@ -52,11 +52,11 @@ struct SharedStorage { #if CUDA_12_0_SM90_FEATURES_SUPPORTED template -__global__ void -bulk_copy_test_device_cute(T const* g_in, - T * g_out, - GmemLayout gmem_layout, - SmemLayout smem_layout) +__global__ void +bulk_copy_test_device_cute(T const* g_in, + T * g_out, + GmemLayout gmem_layout, + SmemLayout smem_layout) { // Use Shared Storage structure to allocate and distribute aligned SMEM addresses extern __shared__ char shared_memory[]; @@ -75,7 +75,7 @@ bulk_copy_test_device_cute(T const* g_in, // Perform the BULK_COPY load // - auto atom = Copy_Atom{}; + auto blkcp = Copy_Traits{}; #if 0 if (thread0()) { @@ -93,7 +93,7 @@ bulk_copy_test_device_cute(T const* g_in, initialize_barrier(bulk_copy_mbar[0], 1 /*numThreads*/); set_barrier_transaction_bytes(bulk_copy_mbar[0], transaction_bytes); - copy(atom.with(bulk_copy_mbar[0]), gA, sA); + copy(blkcp.with(bulk_copy_mbar[0]), gA, sA); } __syncthreads(); @@ -121,11 +121,11 @@ bulk_copy_test_device_cute(T const* g_in, template void run_and_validate(GLayout gmem_layout, - SLayout smem_layout) + SLayout smem_layout) { thrust::host_vector h_in(cosize(gmem_layout)); - for (int32_t i = 0; i < h_in.size(); ++i) { - h_in[i] = T(i); + for (int32_t i = 0; i < h_in.size(); ++i) { + h_in[i] = T(i); } thrust::device_vector d_in = h_in; @@ -148,9 +148,9 @@ void run_and_validate(GLayout gmem_layout, // } // namespace -TEST(SM90_CuTe_BLKCP, ColMajor) +TEST(SM90_CuTe_BLKCP, ColMajor) { - + auto smem_layout = make_layout(Shape<_32,_32>{}, GenColMajor{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); @@ -158,9 +158,9 @@ TEST(SM90_CuTe_BLKCP, ColMajor) run_and_validate(gmem_layout, smem_layout); } -TEST(SM90_CuTe_BLKCP, RowMajor) +TEST(SM90_CuTe_BLKCP, RowMajor) { - + auto smem_layout = make_layout(Shape<_32,_32>{}, GenRowMajor{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); @@ -168,9 +168,9 @@ TEST(SM90_CuTe_BLKCP, RowMajor) run_and_validate(gmem_layout, smem_layout); } -TEST(SM90_CuTe_BLKCP, NonCompact) +TEST(SM90_CuTe_BLKCP, NonCompact) { - + { auto smem_layout = make_layout(Shape<_32,_32>{}, Stride<_1,Int<48>>{}); auto gmem_layout = smem_layout; diff --git a/test/unit/cute/hopper/bulk_store.cu b/test/unit/cute/hopper/bulk_store.cu index 13324b63..ad254fb9 100644 --- a/test/unit/cute/hopper/bulk_store.cu +++ b/test/unit/cute/hopper/bulk_store.cu @@ -51,11 +51,11 @@ struct SharedStorage { #if CUDA_12_0_SM90_FEATURES_SUPPORTED template -__global__ void -bulk_copy_test_device_cute(T const* g_in, - T * g_out, - GmemLayout gmem_layout, - SmemLayout smem_layout) +__global__ void +bulk_copy_test_device_cute(T const* g_in, + T * g_out, + GmemLayout gmem_layout, + SmemLayout smem_layout) { // Use Shared Storage structure to allocate and distribute aligned SMEM addresses extern __shared__ char shared_memory[]; @@ -93,9 +93,9 @@ bulk_copy_test_device_cute(T const* g_in, Tensor gA_out = make_tensor(make_gmem_ptr(g_out), gmem_layout); - auto atom = Copy_Atom, uint8_t>{}; + auto blkcp = Copy_Traits{}; - copy(atom, sA, gA_out); + copy(blkcp, sA, gA_out); // Bulk Copy store requires the same sync as TMA store. tma_store_arrive(); tma_store_wait<0>(); @@ -103,11 +103,11 @@ bulk_copy_test_device_cute(T const* g_in, template void run_and_validate(GLayout gmem_layout, - SLayout smem_layout) + SLayout smem_layout) { thrust::host_vector h_in(cosize(gmem_layout)); - for (int32_t i = 0; i < h_in.size(); ++i) { - h_in[i] = T(i); + for (int32_t i = 0; i < h_in.size(); ++i) { + h_in[i] = T(i); } thrust::device_vector d_in = h_in; @@ -130,9 +130,8 @@ void run_and_validate(GLayout gmem_layout, // } // namespace -TEST(SM90_CuTe_BLKCP, ColMajor) +TEST(SM90_CuTe_BLKCP, ColMajor) { - auto smem_layout = make_layout(Shape<_32,_32>{}, GenColMajor{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); @@ -140,9 +139,8 @@ TEST(SM90_CuTe_BLKCP, ColMajor) run_and_validate(gmem_layout, smem_layout); } -TEST(SM90_CuTe_BLKCP, RowMajor) +TEST(SM90_CuTe_BLKCP, RowMajor) { - auto smem_layout = make_layout(Shape<_32,_32>{}, GenRowMajor{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); @@ -150,9 +148,8 @@ TEST(SM90_CuTe_BLKCP, RowMajor) run_and_validate(gmem_layout, smem_layout); } -TEST(SM90_CuTe_BLKCP, NonCompact) +TEST(SM90_CuTe_BLKCP, NonCompact) { - { auto smem_layout = make_layout(Shape<_32,_32>{}, Stride<_1,Int<48>>{}); auto gmem_layout = smem_layout; diff --git a/test/unit/cute/volta/CMakeLists.txt b/test/unit/cute/volta/CMakeLists.txt new file mode 100644 index 00000000..36a8c581 --- /dev/null +++ b/test/unit/cute/volta/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_test_unit_add_executable( + cutlass_test_unit_cute_volta + vectorization_auto.cu +) diff --git a/test/unit/cute/volta/vectorization_auto.cu b/test/unit/cute/volta/vectorization_auto.cu new file mode 100644 index 00000000..80df0329 --- /dev/null +++ b/test/unit/cute/volta/vectorization_auto.cu @@ -0,0 +1,132 @@ + +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass_unit_test.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +using namespace cute; + +template +__global__ +void +kernel(GmemTensor gC, RmemTiler tiler, CopyPolicy policy) +{ + Tensor tCgC = local_tile(gC, tiler, 0); + + Tensor rC = make_tensor_like(tCgC); + using T = typename GmemTensor::value_type; + for (int i = 0; i < size(rC); ++i) { + rC(i) = T(i % 13); + } + +#if 0 + print(" gC : "); print( gC); print("\n"); + print("tCgC : "); print(tCgC); print("\n"); + print(" rC : "); print( rC); print("\n"); +#endif + + // NOTE: only 1 thread, this thread produce a block of 8x8 output. The fringe will not be touched. + //copy(rC, tCgC); // Enable auto-vectorization if static + //copy_vec(rC, tCgC); // Disable auto-vectorization always + copy(policy, rC, tCgC); // Use a policy to establish vectorization assumptions +} + +template +void +test_copy_vectorization(CopyPolicy policy, GmemLayout gmem_layout, RmemTiler rmem_tiler) +{ + thrust::host_vector h_in(cosize(gmem_layout), T(0)); + + thrust::device_vector d_in = h_in; + Tensor m_in = make_tensor(make_gmem_ptr(raw_pointer_cast(d_in.data())), gmem_layout); + + kernel<<<1,1>>>(m_in, rmem_tiler, policy); + + thrust::host_vector h_out = d_in; + Tensor result = make_tensor(h_out.data(), gmem_layout); + + thrust::host_vector h_true = h_in; + Tensor ref = make_tensor(h_true.data(), gmem_layout); + + // Set the values directly in the reference tensor, no copy + Tensor ref_tile = local_tile(ref, rmem_tiler, 0); + for (int i = 0; i < size(ref_tile); ++i) { + ref_tile(i) = T(i % 13); + } + + // Compare the reference and the result. Print only the first 3 errors. + // print_tensor(result); + int count = 3; + for (int i = 0; i < size(ref) && count > 0; ++i) { + EXPECT_EQ(result(i), ref(i)); + if (result(i) != ref(i)) { + --count; + } + } +} + +template +void +test_copy_vectorization(GmemLayout gmem_layout, RmemTiler rmem_tiler) +{ + test_copy_vectorization(DefaultCopy{}, gmem_layout, rmem_tiler); +} + +TEST(SM70_CuTe_Volta, SimpleVec) +{ + // Fully static layouts are assumed to be aligned -- these will be vectorized + test_copy_vectorization(make_layout(make_shape(Int<8>{}, Int<8>{})), Shape<_8,_8>{}); + test_copy_vectorization(make_layout(make_shape(Int<12>{}, Int<12>{})), Shape<_8,_8>{}); + // Fails in vectorization recast due to misalignment and static assertions + //test_copy_vectorization(make_layout(make_shape(Int<9>{}, Int<9>{})), Shape<_8,_8>{}); + + // Dynamic layouts are not assumed to be aligned -- these will not be vectorized + test_copy_vectorization(make_layout(make_shape(12,12)), Shape<_8,_8>{}); + test_copy_vectorization(make_layout(make_shape( 9, 9)), Shape<_8,_8>{}); + + // Dynamic layouts that are assumed to be aligned -- these will be vectorized + test_copy_vectorization(AutoVectorizingCopyWithAssumedAlignment<128>{}, make_layout(make_shape( 8, 8)), Shape<_8,_8>{}); + test_copy_vectorization(AutoVectorizingCopyWithAssumedAlignment<128>{}, make_layout(make_shape(12,12)), Shape<_8,_8>{}); + // Fails -- bad alignment assumption + //test_copy_vectorization(AutoVectorizingCopyWithAssumedAlignment<128>{}, make_layout(make_shape( 9, 9)), Shape<_8,_8>{}); +}