Updates and Bug fixes to CUTLASS 3.3 (#1232)
This commit is contained in:
@ -353,7 +353,7 @@ struct ThrCopy
|
||||
template <class STensor>
|
||||
CUTE_HOST_DEVICE
|
||||
auto
|
||||
partition_S(STensor&& stensor) {
|
||||
partition_S(STensor&& stensor) const {
|
||||
//static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
|
||||
// "Expected ValType for tiling SrcTensor.");
|
||||
auto thr_tensor = make_tensor(std::forward<STensor>(stensor).data(), TiledCopy::tidfrg_S(stensor.layout()));
|
||||
@ -363,7 +363,7 @@ struct ThrCopy
|
||||
template <class DTensor>
|
||||
CUTE_HOST_DEVICE
|
||||
auto
|
||||
partition_D(DTensor&& dtensor) {
|
||||
partition_D(DTensor&& dtensor) const {
|
||||
//static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
|
||||
// "Expected ValType for tiling DstTensor.");
|
||||
auto thr_tensor = make_tensor(std::forward<DTensor>(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout()));
|
||||
@ -479,10 +479,10 @@ make_tiled_copy_C_atom(Copy_Atom<Args...> const& copy_atom,
|
||||
return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
|
||||
}
|
||||
|
||||
/** Produce a TiledCopy from logical thread and values layouts.
|
||||
* The thread and value layouts map coordinates to thr_idx and val_idx.
|
||||
/** Produce a TiledCopy from logical thread and values layouts.
|
||||
* The thread and value layouts map coordinates to thr_idx and val_idx.
|
||||
* The product of these layouts is taken to produce the TV layout and the Tiler.
|
||||
* Useful when threads and values need very specific mappings onto coordinates
|
||||
* Useful when threads and values need very specific mappings onto coordinates
|
||||
* in the target tensors.
|
||||
*/
|
||||
template <class... Args,
|
||||
@ -510,16 +510,16 @@ make_tiled_copy(Copy_Atom<Args...> const& copy_atom,
|
||||
return make_tiled_copy_impl(copy_atom, layout_tv, product_each(shape(layout_mn)));
|
||||
}
|
||||
|
||||
/** Produce a TiledCopy from thread and value offset maps.
|
||||
/** Produce a TiledCopy from thread and value offset maps.
|
||||
* The TV Layout maps threads and values to the codomain of the data_layout.
|
||||
* It is verified that the intended codomain is valid within data_layout.
|
||||
* It is verified that the intended codomain is valid within data_layout.
|
||||
* Useful when threads and values don't care about owning specific coordinates, but
|
||||
* care more about the vector-width and offsets between them.
|
||||
*/
|
||||
template <class... Args, class AtomTVLayout, class DataLayout>
|
||||
CUTE_HOST_DEVICE constexpr
|
||||
auto
|
||||
make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
|
||||
make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
|
||||
AtomTVLayout const& atom_tv_layout, // atom (thr,val) -> data addr
|
||||
DataLayout const& data_layout) // coord -> data addr The target layout
|
||||
{
|
||||
|
||||
@ -59,7 +59,7 @@ namespace cute
|
||||
// Primary (unspecialized) Copy_Traits template.
// Its only member is a static_assert whose message reports that Copy_Traits
// is not implemented for the given copy operation; the specializations that
// follow in this listing (e.g. for UniversalCopy) supply real definitions.
// NOTE(review): this listing is a rendered diff -- the two static_assert lines
// below appear to be the pre-change and post-change forms of one statement,
// not two statements that coexist in the real header.
template <class CopyOperation, class... CopyOpArgs>
|
||||
struct Copy_Traits
|
||||
{
|
||||
// Condition depends on the template parameter; sizeof of a complete type is never 0.
static_assert(sizeof(CopyOperation) == 0, "Copy_Traits not implemented for this Copy_Operation.");
|
||||
// dependent_false is declared outside this view -- presumably always false; confirm.
static_assert(dependent_false<CopyOperation>, "Copy_Traits not implemented for this CopyOperation.");
|
||||
};
|
||||
|
||||
template <class S, class D>
|
||||
@ -77,8 +77,8 @@ struct Copy_Traits<UniversalCopy<S,D>>
|
||||
using RefLayout = SrcLayout;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Copy_Traits<DefaultCopy>
|
||||
template <int MaxVecBits>
|
||||
struct Copy_Traits<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>>
|
||||
{
|
||||
// Logical thread id to thread idx (one-thread)
|
||||
using ThrID = Layout<_1>;
|
||||
@ -108,23 +108,24 @@ copy_explode(PtrS&& s, int_sequence<Is...>,
|
||||
} // end namespace detail
|
||||
|
||||
//
|
||||
// Generic copy_unpack for any Copy_Traits
|
||||
// Generic copy_unpack for common argument-based Copy_Traits
|
||||
//
|
||||
template <class Operation, class... Args,
|
||||
class TS, class SLayout,
|
||||
class TD, class DLayout>
|
||||
|
||||
template <class CopyOp, class... Args,
|
||||
class SEngine, class SLayout,
|
||||
class DEngine, class DLayout>
|
||||
CUTE_HOST_DEVICE constexpr
|
||||
void
|
||||
copy_unpack(Copy_Traits<Operation, Args...> const&,
|
||||
Tensor<TS,SLayout> const& src,
|
||||
Tensor<TD,DLayout> & dst)
|
||||
copy_unpack(Copy_Traits<CopyOp,Args...> const&,
|
||||
Tensor<SEngine,SLayout> const& src,
|
||||
Tensor<DEngine,DLayout> & dst)
|
||||
{
|
||||
// Specializations can generalize on these checks
|
||||
//static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<Operation>");
|
||||
//static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<Operation>");
|
||||
//static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<CopyOp>");
|
||||
//static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<CopyOp>");
|
||||
|
||||
using RegistersSrc = typename Operation::SRegisters;
|
||||
using RegistersDst = typename Operation::DRegisters;
|
||||
using RegistersSrc = typename CopyOp::SRegisters;
|
||||
using RegistersDst = typename CopyOp::DRegisters;
|
||||
using RegTypeSrc = typename remove_extent<RegistersSrc>::type;
|
||||
using RegTypeDst = typename remove_extent<RegistersDst>::type;
|
||||
constexpr int RegNumSrc = extent<RegistersSrc>::value;
|
||||
@ -134,26 +135,26 @@ copy_unpack(Copy_Traits<Operation, Args...> const&,
|
||||
Tensor rD = recast<RegTypeDst>(dst);
|
||||
|
||||
CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
|
||||
"In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
|
||||
"Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp.");
|
||||
CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
|
||||
"In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
|
||||
"Copy_Traits: dst failed to vectorize into registers. Layout is incompatible with this CopyOp.");
|
||||
|
||||
detail::copy_explode<Operation>(rS, make_int_sequence<RegNumSrc>{},
|
||||
rD, make_int_sequence<RegNumDst>{});
|
||||
detail::copy_explode<CopyOp>(rS, make_int_sequence<RegNumSrc>{},
|
||||
rD, make_int_sequence<RegNumDst>{});
|
||||
}
|
||||
|
||||
//
|
||||
// Accept mutable temporaries
// Overload of copy_unpack that takes the destination tensor as an rvalue
// reference, so a temporary Tensor can be passed directly as dst.
// It performs no work of its own and forwards all three arguments.
// NOTE(review): this listing is a rendered diff -- each signature line appears
// twice (old parameter names Operation/TS/TD, then new names
// CopyOp/SEngine/DEngine); only one form exists in the real header.
|
||||
//
|
||||
|
||||
template <class Operation, class... Args,
|
||||
class TS, class SLayout,
|
||||
class TD, class DLayout>
|
||||
template <class CopyOp, class... Args,
|
||||
class SEngine, class SLayout,
|
||||
class DEngine, class DLayout>
|
||||
CUTE_HOST_DEVICE constexpr
|
||||
void
|
||||
copy_unpack(Copy_Traits<Operation, Args...> const& traits,
|
||||
Tensor<TS,SLayout> const& src,
|
||||
Tensor<TD,DLayout> && dst)
|
||||
copy_unpack(Copy_Traits<CopyOp,Args...> const& traits,
|
||||
Tensor<SEngine,SLayout> const& src,
|
||||
Tensor<DEngine,DLayout> && dst)
|
||||
{
|
||||
// dst is a named rvalue reference, hence an lvalue expression here, so this
// call cannot bind to this && overload again; it selects a copy_unpack
// overload that takes dst by lvalue reference.
copy_unpack(traits, src, dst);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user