Updates and Bug fixes to CUTLASS 3.3 (#1232)

This commit is contained in:
Pradeep Ramani
2023-12-05 06:50:49 -08:00
committed by GitHub
parent 4a1709e17e
commit e9e30c2304
31 changed files with 534 additions and 227 deletions

View File

@@ -353,7 +353,7 @@ struct ThrCopy
template <class STensor>
CUTE_HOST_DEVICE
auto
partition_S(STensor&& stensor) {
partition_S(STensor&& stensor) const {
//static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
// "Expected ValType for tiling SrcTensor.");
auto thr_tensor = make_tensor(std::forward<STensor>(stensor).data(), TiledCopy::tidfrg_S(stensor.layout()));
@@ -363,7 +363,7 @@ struct ThrCopy
template <class DTensor>
CUTE_HOST_DEVICE
auto
partition_D(DTensor&& dtensor) {
partition_D(DTensor&& dtensor) const {
//static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
// "Expected ValType for tiling DstTensor.");
auto thr_tensor = make_tensor(std::forward<DTensor>(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout()));
@@ -479,10 +479,10 @@ make_tiled_copy_C_atom(Copy_Atom<Args...> const& copy_atom,
return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
}
/** Produce a TiledCopy from logical thread and values layouts.
* The thread and value layouts map coordinates to thr_idx and val_idx.
/** Produce a TiledCopy from logical thread and values layouts.
* The thread and value layouts map coordinates to thr_idx and val_idx.
* The product of these layouts is taken to produce the TV layout and the Tiler.
* Useful when threads and values need very specific mappings onto coordinates
* Useful when threads and values need very specific mappings onto coordinates
* in the target tensors.
*/
template <class... Args,
@@ -510,16 +510,16 @@ make_tiled_copy(Copy_Atom<Args...> const& copy_atom,
return make_tiled_copy_impl(copy_atom, layout_tv, product_each(shape(layout_mn)));
}
/** Produce a TiledCopy from thread and value offset maps.
/** Produce a TiledCopy from thread and value offset maps.
* The TV Layout maps threads and values to the codomain of the data_layout.
* It is verified that the intended codomain is valid within data_layout.
* It is verified that the intended codomain is valid within data_layout.
* Useful when threads and values don't care about owning specific coordinates, but
* care more about the vector-width and offsets between them.
*/
template <class... Args, class AtomTVLayout, class DataLayout>
CUTE_HOST_DEVICE constexpr
auto
make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
AtomTVLayout const& atom_tv_layout, // atom (thr,val) -> data addr
DataLayout const& data_layout) // coord -> data addr The target layout
{

View File

@@ -59,7 +59,7 @@ namespace cute
template <class CopyOperation, class... CopyOpArgs>
struct Copy_Traits
{
static_assert(sizeof(CopyOperation) == 0, "Copy_Traits not implemented for this Copy_Operation.");
static_assert(dependent_false<CopyOperation>, "Copy_Traits not implemented for this CopyOperation.");
};
template <class S, class D>
@@ -77,8 +77,8 @@ struct Copy_Traits<UniversalCopy<S,D>>
using RefLayout = SrcLayout;
};
template <>
struct Copy_Traits<DefaultCopy>
template <int MaxVecBits>
struct Copy_Traits<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>>
{
// Logical thread id to thread idx (one-thread)
using ThrID = Layout<_1>;
@@ -108,23 +108,24 @@ copy_explode(PtrS&& s, int_sequence<Is...>,
} // end namespace detail
//
// Generic copy_unpack for any Copy_Traits
// Generic copy_unpack for common argument-based Copy_Traits
//
template <class Operation, class... Args,
class TS, class SLayout,
class TD, class DLayout>
template <class CopyOp, class... Args,
class SEngine, class SLayout,
class DEngine, class DLayout>
CUTE_HOST_DEVICE constexpr
void
copy_unpack(Copy_Traits<Operation, Args...> const&,
Tensor<TS,SLayout> const& src,
Tensor<TD,DLayout> & dst)
copy_unpack(Copy_Traits<CopyOp,Args...> const&,
Tensor<SEngine,SLayout> const& src,
Tensor<DEngine,DLayout> & dst)
{
// Specializations can generalize on these checks
//static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<Operation>");
//static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<Operation>");
//static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<CopyOp>");
//static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<CopyOp>");
using RegistersSrc = typename Operation::SRegisters;
using RegistersDst = typename Operation::DRegisters;
using RegistersSrc = typename CopyOp::SRegisters;
using RegistersDst = typename CopyOp::DRegisters;
using RegTypeSrc = typename remove_extent<RegistersSrc>::type;
using RegTypeDst = typename remove_extent<RegistersDst>::type;
constexpr int RegNumSrc = extent<RegistersSrc>::value;
@@ -134,26 +135,26 @@ copy_unpack(Copy_Traits<Operation, Args...> const&,
Tensor rD = recast<RegTypeDst>(dst);
CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
"In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
"Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp.");
CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
"In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
"Copy_Traits: dst failed to vectorize into registers. Layout is incompatible with this CopyOp.");
detail::copy_explode<Operation>(rS, make_int_sequence<RegNumSrc>{},
rD, make_int_sequence<RegNumDst>{});
detail::copy_explode<CopyOp>(rS, make_int_sequence<RegNumSrc>{},
rD, make_int_sequence<RegNumDst>{});
}
//
// Accept mutable temporaries
//
template <class Operation, class... Args,
class TS, class SLayout,
class TD, class DLayout>
template <class CopyOp, class... Args,
class SEngine, class SLayout,
class DEngine, class DLayout>
CUTE_HOST_DEVICE constexpr
void
copy_unpack(Copy_Traits<Operation, Args...> const& traits,
Tensor<TS,SLayout> const& src,
Tensor<TD,DLayout> && dst)
copy_unpack(Copy_Traits<CopyOp,Args...> const& traits,
Tensor<SEngine,SLayout> const& src,
Tensor<DEngine,DLayout> && dst)
{
copy_unpack(traits, src, dst);
}