Files
cutlass/include/cute/atom/copy_traits_sm100.hpp
Junkai-Wu a49a78ffef v4.2 release. (#2587)
* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set in command line.

* v4.2 release.
2025-08-22 18:11:24 -04:00

3799 lines
143 KiB
C++

/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <cute/arch/copy_sm100.hpp>
#include <cute/arch/tmem_allocator_sm100.hpp>
#include <cute/atom/copy_traits.hpp>
#include <cute/atom/copy_atom.hpp>
#include <cute/atom/partitioner.hpp>
#include <cute/numeric/numeric_types.hpp>
#include <cute/layout.hpp>
namespace cute
{
template <>
struct Copy_Traits<SM100_LOAD_256bit_CACHE_NOALLOCATION>
{
// Logical thread id to thread idx (one-thread)
using ThrID = Layout<_1>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape<_1,_256>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape<_1,_256>>;
// Reference map from (thr,val) to bit
using RefLayout = SrcLayout;
};
template <>
struct Copy_Traits<SM100_STORE_256bit_CACHE_NOALLOCATION>
{
// Logical thread id to thread idx (one-thread)
using ThrID = Layout<_1>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape<_1,_256>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape<_1,_256>>;
// Reference map from (thr,val) to bit
using RefLayout = SrcLayout;
};
template <>
struct Copy_Traits<SM100_U8x8_LDSM_T>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _2, _2, _4,_2>,_128>,
Stride<Stride<_128,_1024,_256,_0>, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <Shape < _4,_8>,Shape <_8, _2, _2, _2>>,
Stride<Stride<_256,_8>,Stride<_1,_128,_64,_1024>>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_U8x16_LDSM_T>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _2, _2, _4, _2>,_128>,
Stride<Stride<_128,_1024,_256,_2048>, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <Shape < _4,_8>,Shape <_8, _2, _2, _4>>,
Stride<Stride<_256,_8>,Stride<_1,_128,_64,_1024>>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_SU4_DU8x16_x1_LDSM_N>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _8,_4>,_128>,
Stride<Stride<_128,_0>, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <_32,_32>,
Stride<_32, _1>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_SU6_DU8x16_x1_LDSM_N>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _8,_4>,_128>,
Stride<Stride<_128,_0>, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <_32,_32>,
Stride<_32, _1>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_SU4_DU8x16_x2_LDSM_N>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _16,_2>,_128>,
Stride<Stride<_128,_0>, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <_32,Shape <_32, _2>>,
Stride<_32,Stride< _1,_1024>>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_SU6_DU8x16_x2_LDSM_N>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _16,_2>,_128>,
Stride<Stride<_128,_0>, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <_32,Shape <_32, _2>>,
Stride<_32,Stride< _1,_1024>>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_SU4_DU8x16_x4_LDSM_N>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape < _32,_128>,
Stride<_128, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <_32,Shape <_32, _4>>,
Stride<_32,Stride< _1,_1024>>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_SU6_DU8x16_x4_LDSM_N>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape < _32,_128>,
Stride<_128, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <_32,Shape <_32, _4>>,
Stride<_32,Stride< _1,_1024>>>;
// Reference map from (thr,val) to bit
using RefLayout = DstLayout;
};
template <>
struct Copy_Traits<SM100_U8x4_STSM_T>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _4,_8>,Shape <_8, _2, _2>>,
Stride<Stride<_256,_8>,Stride<_1,_128,_64>>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <Shape < _8,_4>,_128>,
Stride<Stride<_128,_0>, _1>>;
// Reference map from (thr,val) to bit
using RefLayout = SrcLayout;
};
template <>
struct Copy_Traits<SM100_U8x8_STSM_T>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _4,_8>,Shape <_8, _2, _2, _2>>,
Stride<Stride<_256,_8>,Stride<_1,_128,_64,_1024>>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <Shape < _16,_2>,_128>,
Stride<Stride<_128,_0>, _1>>;
// Reference map from (thr,val) to bit
using RefLayout = SrcLayout;
};
template <>
struct Copy_Traits<SM100_U8x16_STSM_T>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <Shape < _4,_8>,Shape <_8, _2, _2, _4>>,
Stride<Stride<_256,_8>,Stride<_1,_128,_64,_1024>>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape < _32,_128>,
Stride<_128, _1>>;
// Reference map from (thr,val) to bit
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// TMEM Traits and Utilities
//
////////////////////////////////////////////////////////////////////////////////////////////////////
template <class... Args>
struct Copy_Atom;
/** Generate a TiledCopy from a CopyAtom and a TMEM tensor
* Example:
* Tensor gmem_tensor = ... // (M,N,...)
* Tensor tmem_tensor = ... // (M,N,...)
* auto tiled_tmem_load = make_tmem_copy(TMEM_LOAD_Operation, tmem_tensor);
* auto thr_tmem_load = tiled_tmem_load.get_slice(thread_idx);
*
* Tensor tDtC = thr_tmem_load.partition_S(tmem_tensor); // (TMEM_LOAD,TMEM_LOAD_M,TMEM_LOAD_N,...)
* Tensor tDgC = thr_tmem_load.partition_D(gmem_tensor); // (TMEM_LOAD,TMEM_LOAD_M,TMEM_LOAD_N,...)
* Tensor tDrC = make_tensor<ElementAccumulator>(shape(tDgD)); // (TMEM_LOAD,TMEM_LOAD_M,TMEM_LOAD_N,...)
*
* copy(tiled_tmem_load, tDtC, tDrC); // tmem -> rmem
* copy(tDrC, tDgC); // rmem -> gmem
*/
template <class CopyOp, class CopyT,
class TEngine, class TLayout>
CUTE_HOST_DEVICE constexpr
auto
make_tmem_copy(Copy_Atom<CopyOp,CopyT> const& atom,
Tensor<TEngine,TLayout> const& tmem)
{
static_assert(is_tmem<TEngine>::value, "Expected TMEM tensor.");
using T = typename TEngine::value_type;
using Traits = typename Copy_Atom<CopyOp, CopyT>::Traits;
static_assert(sizeof_bits_v<CopyT> == sizeof_bits_v<T>,
"Expected a CopyAtom with the same type-width as the Tensor.");
// atom thr idx -> tmem addr 4warps where each warp points to the same position within it's own subpartition
auto atom_t_layout = Layout<Shape<_32,_4>, Stride<_0, decltype(Int<32>{} * TMEM::DP<T>{})>>{};
// atom val idx -> tmem addr Cast the CopyOp's value ids to the proper data width
auto atom_v_layout = coalesce(upcast<sizeof_bits<T>::value>(typename Traits::ValID{}));
return make_cotiled_copy(atom, make_layout(atom_t_layout, atom_v_layout), tmem.layout());
}
template <class CopyOp,
class TEngine, class TLayout>
CUTE_HOST_DEVICE constexpr
auto
make_tmem_copy(CopyOp const&,
Tensor<TEngine,TLayout> const& tmem)
{
return make_tmem_copy(Copy_Atom<CopyOp, typename TEngine::value_type>{}, tmem);
}
/** Generate a TV_Tiler from a TMEM tensor
* Example:
* Tensor gmem_tensor = ... // (M,N,...)
* Tensor tmem_tensor = ... // (M,N,...)
* auto tmem_tiler = make_tmem_warp_partitioner(tmem_tensor);
* auto warp_tiler = tmem_tiler.get_slice(warp_idx);
*
* Tensor tWtC = warp_tiler.partition(tmem_tensor); // (WARP_M,WARP_N,...)
* Tensor tWgC = warp_tiler.partition(gmem_tensor); // (WARP_M,WARP_N,...)
*/
template <class TEngine, class TLayout>
CUTE_HOST_DEVICE constexpr
auto
make_tmem_warp_partitioner(Tensor<TEngine,TLayout> const& tmem)
{
static_assert(is_tmem<TEngine>::value, "Expected TMEM tensor.");
using T = typename TEngine::value_type;
// warp idx -> tmem addr This is the T in the Layout_TV
auto atom_t_layout = Layout<_4, decltype(Int<32>{} * TMEM::DP<T>{})>{};
// tmem coord -> tmem addr
auto tmem_layout = tmem.layout();
// tmem addr -> tmem coord Append 1:0 so off-the-ends get the stride-0
auto inv_tmem_layout = make_layout(left_inverse(tmem_layout), Layout<_1,_0>{});
// wid -> tmem_coord
auto layout_t_tmem = composition(inv_tmem_layout, atom_t_layout);
//
// Tiler -- Find the active elements in the TMEM tensor and generate a tiler to extract them
//
// Convert to the awkward by-mode tiler to preserve the modes of the tiled TMEM
auto flat_tmem_shape = product_each(shape(tmem_layout));
auto flat_tmem_zeros = repeat<rank(flat_tmem_shape)>(Int<0>{});
auto tiler = transform(make_seq<rank(flat_tmem_shape)>{}, [&](auto i) {
return filter(composition(make_layout(flat_tmem_shape, replace<i>(flat_tmem_zeros, Int<1>{})), layout_t_tmem));
});
//
// Layout_TV -- Find the (tid,vid) -> tile coord transformation
//
// Apply the tiler to a reference and transform the codomain
// tile_coord -> tmem_coord
auto tile2tmem = composition(make_layout(flat_tmem_shape), tiler);
// wid -> tile_coord
auto layout_tv = composition(left_inverse(tile2tmem), layout_t_tmem);
return make_tiler_impl(layout_tv, tiler);
}
namespace SM100::TMEM::LOAD {
//
// Specialized copy_unpack implementation for SM100::TMEM::LOAD instructions
//
template <class CopyOp,
class TS, class SLayout,
class TD, class DLayout>
CUTE_HOST_DEVICE constexpr
void
copy_unpack(Copy_Traits<CopyOp> const& traits,
Tensor<TS,SLayout> const& src,
Tensor<TD,DLayout> & dst)
{
static_assert(is_tmem<TS>::value, "Expected TMEM src.");
static_assert(is_rmem<TD>::value, "Expected RMEM dst.");
using SrcType = typename TS::value_type;
CUTE_STATIC_ASSERT_V((coalesce(layout(src)) == coalesce(upcast<sizeof_bits<SrcType>::value>(typename Copy_Traits<CopyOp>::ValID{}))),
"Expected src to have the specific TMEM layout required by CopyOp.");
uint32_t tmem_addr = raw_pointer_cast(src.data());
using RegTypeDst = typename remove_extent<typename CopyOp::DRegisters>::type;
Tensor rD = recast<RegTypeDst>(dst);
constexpr int RegNumDst = extent<typename CopyOp::DRegisters>::value;
CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
"In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this CopyOp.");
// thread idx <=> DP lane assert.
// ASSERT TMEM_LOAD thread attemping to access DP lane within sub-partition.
#if defined(__CUDA_ARCH__) && !defined(NDEBUG)
assert(((uint32_t(threadIdx.x) / 32) % 4) == (((tmem_addr >> 16) / 32) % 4));
#endif
detail::explode(CopyOp::copy,
&tmem_addr, seq<0>{},
rD, make_seq<RegNumDst>{});
}
} // end namespace SM100::TMEM::LOAD
namespace SM100::TMEM::STORE {
//
// Specialized copy_unpack implementation for SM100::TMEM::STORE instructions
//
template <class CopyOp,
class TS, class SLayout,
class TD, class DLayout>
CUTE_HOST_DEVICE constexpr
void
copy_unpack(Copy_Traits<CopyOp> const& traits,
Tensor<TS,SLayout> const& src,
Tensor<TD,DLayout> & dst)
{
static_assert(is_rmem<TS>::value, "Expected RMEM src.");
static_assert(is_tmem<TD>::value, "Expected TMEM dst.");
using RegTypeSrc = typename remove_extent<typename CopyOp::SRegisters>::type;
Tensor rS = recast<RegTypeSrc>(src);
constexpr int RegNumSrc = extent<typename CopyOp::SRegisters>::value;
CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
"In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
using DstType = typename TD::value_type;
CUTE_STATIC_ASSERT_V((coalesce(layout(dst)) == coalesce(upcast<sizeof_bits<DstType>::value>(typename Copy_Traits<CopyOp>::ValID{}))),
"Expected dst to have the specific TMEM layout required by CopyOp.");
uint32_t tmem_addr = raw_pointer_cast(dst.data());
// thread idx <=> DP lane assert.
// ASSERT TMEM_LOAD thread attemping to access DP lane within sub-partition.
#if defined(__CUDA_ARCH__) && !defined(NDEBUG)
assert(((uint32_t(threadIdx.x) / 32) % 4) == (((tmem_addr >> 16) / 32) % 4));
#endif
detail::explode(CopyOp::copy,
rS, make_seq<RegNumSrc>{},
&tmem_addr, seq<0>{});
}
} // end namespace SM100::TMEM::STORE
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// TMEM_LOAD Copy Traits
//
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b1x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b1x>
{
// Logical thread id to thread idx (warp)
using ThrID = Layout<_32>;
// Logical bit id to bit idx (address)
using ValID = Layout<Shape <_256, _16>,
Stride< _1,TMEM::DP_b>>;
// Map from (src-thr,src-val) to bit
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
// Map from (dst-thr,dst-val) to bit
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2>>,
Stride<Stride<_64,_256>,Stride< _1,_2048>>>;
// Reference map from (thr,val) to bit
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b1x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_16>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2>>,
Stride<Stride<_64,_256>,Stride< _1,_2048>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b2x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b2x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_512, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _2>>,
Stride<Stride<_64,_512>,Stride< _1,_4096,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b2x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_32>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _2>>,
Stride<Stride<_64,_512>,Stride< _1,_4096,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b4x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b4x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_1024, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _4>>,
Stride<Stride<_64,_1024>,Stride< _1,_8192,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b4x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_64>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _4>>,
Stride<Stride<_64,_1024>,Stride< _1,_8192,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b8x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b8x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_2048, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _8>>,
Stride<Stride<_64,_2048>,Stride< _1,_16384,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b8x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_128>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _8>>,
Stride<Stride<_64,_2048>,Stride< _1,_16384,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b16x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b16x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_4096, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _16>>,
Stride<Stride<_64,_4096>,Stride< _1,_32768,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b16x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_256>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _16>>,
Stride<Stride<_64,_4096>,Stride< _1,_32768,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b32x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b32x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_8192, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _32>>,
Stride<Stride<_64,_8192>,Stride< _1,_65536,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp256b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp256b32x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_512>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_64, _2, _32>>,
Stride<Stride<_64,_8192>,Stride< _1,_65536,_256>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b1x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b1x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_128, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2>>,
Stride<Stride<_32,_128>,Stride< _1,_1024>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b1x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _8>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2>>,
Stride<Stride<_32,_128>,Stride< _1,_1024>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b2x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b2x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_256, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _2>>,
Stride<Stride<_32,_256>,Stride< _1,_2048,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b2x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_16>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _2>>,
Stride<Stride<_32,_256>,Stride< _1,_2048,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b4x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b4x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_512, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _4>>,
Stride<Stride<_32,_512>,Stride< _1,_4096,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b4x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_32>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _4>>,
Stride<Stride<_32,_512>,Stride< _1,_4096,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b8x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b8x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_1024, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _8>>,
Stride<Stride<_32,_1024>,Stride< _1,_8192,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b8x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_64>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _8>>,
Stride<Stride<_32,_1024>,Stride< _1,_8192,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b16x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b16x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_2048, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _16>>,
Stride<Stride<_32,_2048>,Stride< _1,_16384,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b16x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_128>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _16>>,
Stride<Stride<_32,_2048>,Stride< _1,_16384,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b32x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b32x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_4096, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _32>>,
Stride<Stride<_32,_4096>,Stride< _1,_32768,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b32x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_256>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _32>>,
Stride<Stride<_32,_4096>,Stride< _1,_32768,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b64x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b64x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_8192, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _64>>,
Stride<Stride<_32,_8192>,Stride< _1,_65536,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp128b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp128b64x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_512>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _4, _8>,Shape <_32, _2, _64>>,
Stride<Stride<_32,_8192>,Stride< _1,_65536,_128>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b1x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b1x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_64, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_1024>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,_32>,
Stride<Stride<_512,_32,_64>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b1x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _4>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_1024>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,_32>,
Stride<Stride<_512,_32,_64>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b2x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b2x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_128, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32, _2>>,
Stride<Stride<_1024,_32,_128>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b2x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _8>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32, _2>>,
Stride<Stride<_1024,_32,_128>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b4x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b4x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_256, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32, _4>>,
Stride<Stride<_2048,_32,_256>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b4x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_16>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32, _4>>,
Stride<Stride<_2048,_32,_256>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b8x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b8x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_512, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32, _8>>,
Stride<Stride<_4096,_32,_512>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b8x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_32>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32, _8>>,
Stride<Stride<_4096,_32,_512>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b16x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b16x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_1024, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_16>>,
Stride<Stride<_8192,_32,_1024>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b16x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_64>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_16>>,
Stride<Stride<_8192,_32,_1024>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b32x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b32x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_2048, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_32>>,
Stride<Stride<_16384,_32,_2048>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b32x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_128>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_32>>,
Stride<Stride<_16384,_32,_2048>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b64x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b64x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_4096, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_64>>,
Stride<Stride<_32768,_32,_4096>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b64x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_256>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_64>>,
Stride<Stride<_32768,_32,_4096>,Stride< _1,_64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b128x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b128x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_8192, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_128>>,
Stride<Stride<_65536,_32,_8192>,Stride< _1, _64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp64b128x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp64b128x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_512>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _2, _2, _8>,Shape <_32,_128>>,
Stride<Stride<_65536,_32,_8192>,Stride< _1, _64>>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b1x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b1x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_64, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_1024>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape <_16, _2>,_32>,
Stride<Stride<_64,_32>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b1x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _4>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_1024>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape <_16, _2>,_32>,
Stride<Stride<_64,_32>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b2x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b2x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_128, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_64>,
Stride<Stride<_128,_64>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b2x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _8>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_64>,
Stride<Stride<_128,_64>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b4x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b4x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_256, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_128>,
Stride<Stride<_256,_128>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b4x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_16>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_128>,
Stride<Stride<_256,_128>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b8x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b8x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_512, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_256>,
Stride<Stride<_512,_256>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b8x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_32>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_256>,
Stride<Stride<_512,_256>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b16x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b16x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_1024, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_512>,
Stride<Stride<_1024,_512>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b16x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_64>, _16>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_512>,
Stride<Stride<_1024,_512>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b32x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b32x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_2048, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_1024>,
Stride<Stride<_2048,_1024>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b32x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_128>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_1024>,
Stride<Stride<_2048,_1024>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b64x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b64x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_4096, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_2048>,
Stride<Stride<_4096,_2048>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b64x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_256>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_2048>,
Stride<Stride<_4096,_2048>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b128x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b128x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_8192, _16>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_4096>,
Stride<Stride<_8192,_4096>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_16dp32b128x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_16dp32b128x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_512>, _16>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <Shape < _16, _2>,_4096>,
Stride<Stride<_8192,_4096>, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b1x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b1x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_32, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_1024>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <_32,_32>,
Stride<_32, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b1x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _2>, _32>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_1024>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <_32,_32>,
Stride<_32, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b2x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b2x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_64, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <_32,_64>,
Stride<_64, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b2x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _4>, _32>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_2048>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape <_32,_64>,
Stride<_64, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b4x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b4x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_128, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_128>,
Stride<_128, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b4x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16, _8>, _32>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_4096>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_128>,
Stride<_128, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b8x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b8x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_256, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_256>,
Stride<_256, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b8x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_16>, _32>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_8192>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_256>,
Stride<_256, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b16x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b16x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_512, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_512>,
Stride<_512, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b16x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_32>, _32>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_16384>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_512>,
Stride<_512, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b32x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b32x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_1024, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_1024>,
Stride<_1024, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b32x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_64>, _32>,
Stride<Stride< _1,_32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_32768>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_1024>,
Stride<_1024, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b64x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b64x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_2048, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_2048>,
Stride<_2048, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b64x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_128>, _32>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_65536>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_2048>,
Stride<_2048, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b128x;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b128x>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <_4096, _32>,
Stride< _1,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_4096>,
Stride<_4096, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::LOAD::SM100_TMEM_LOAD_32dp32b128x_16b;
template <>
struct Copy_Traits<SM100_TMEM_LOAD_32dp32b128x_16b>
{
using ThrID = Layout<_32>;
using ValID = Layout<Shape <Shape <_16,_256>, _32>,
Stride<Stride< _1, _32>,TMEM::DP_b>>;
using SrcLayout = Layout<Shape <_32,_131072>,
Stride< _0, _1>>;
using DstLayout = Layout<Shape < _32,_4096>,
Stride<_4096, _1>>;
using RefLayout = SrcLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// TMEM_STORE Copy Traits
//
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b1x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b1x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b1x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b1x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b2x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b2x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b2x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b2x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b4x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b4x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b4x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b4x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b8x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b8x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b8x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b8x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b16x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b16x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b16x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b16x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b32x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b32x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp256b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp256b32x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp256b32x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b1x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b1x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b1x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b1x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b2x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b2x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b2x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b2x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b4x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b4x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b4x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b4x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b8x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b8x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b8x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b8x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b16x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b16x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b16x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b16x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b32x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b32x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b32x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b32x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b64x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b64x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp128b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp128b64x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp128b64x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b1x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b1x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b1x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b1x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b2x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b2x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b2x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b2x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b4x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b4x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b4x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b4x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b8x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b8x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b8x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b8x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b16x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b16x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b16x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b16x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b32x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b32x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b32x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b32x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b64x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b64x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b64x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b64x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b128x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b128x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp64b128x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp64b128x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp64b128x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b1x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b1x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b1x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b1x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b2x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b2x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b2x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b2x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b4x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b4x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b4x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b4x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b8x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b8x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b8x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b8x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b16x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b16x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b16x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b16x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b32x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b32x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b32x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b32x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b64x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b64x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b64x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b64x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b128x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b128x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_16dp32b128x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_16dp32b128x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_16dp32b128x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b1x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b1x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b1x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b1x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b1x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b2x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b2x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b2x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b2x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b2x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b4x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b4x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b4x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b4x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b4x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b8x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b8x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b8x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b8x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b8x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b16x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b16x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b16x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b16x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b16x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b32x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b32x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b32x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b32x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b32x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b64x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b64x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b64x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b64x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b64x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b128x;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b128x>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::STORE::SM100_TMEM_STORE_32dp32b128x_16b;
template <>
struct Copy_Traits<SM100_TMEM_STORE_32dp32b128x_16b>
{
using ThrID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x_16b>::ThrID;
using ValID = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x_16b>::ValID;
using SrcLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x_16b>::DstLayout;
using DstLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x_16b>::SrcLayout;
using RefLayout = typename Copy_Traits<SM100_TMEM_LOAD_32dp32b128x_16b>::RefLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace TMEM {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Given a 1x tmem copy op, returns the widest repeated variant that divides the specified bits in the N-mode
template <class CopyOp, int bits_n>
CUTE_HOST_DEVICE constexpr
auto
op_repeater()
{
if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b1x>) {
if constexpr (bits_n % (256 * 32) == 0) {
return SM100_TMEM_LOAD_16dp256b32x{};
}
else if constexpr (bits_n % (256 * 16) == 0) {
return SM100_TMEM_LOAD_16dp256b16x{};
}
else if constexpr (bits_n % (256 * 8) == 0) {
return SM100_TMEM_LOAD_16dp256b8x{};
}
else if constexpr (bits_n % (256 * 4) == 0) {
return SM100_TMEM_LOAD_16dp256b4x{};
}
else if constexpr (bits_n % (256 * 2) == 0) {
return SM100_TMEM_LOAD_16dp256b2x{};
}
else if constexpr (bits_n % (256 * 1) == 0) {
return SM100_TMEM_LOAD_16dp256b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b1x_16b>) {
if constexpr (bits_n % (256 * 32) == 0) {
return SM100_TMEM_LOAD_16dp256b32x_16b{};
}
else if constexpr (bits_n % (256 * 16) == 0) {
return SM100_TMEM_LOAD_16dp256b16x_16b{};
}
else if constexpr (bits_n % (256 * 8) == 0) {
return SM100_TMEM_LOAD_16dp256b8x_16b{};
}
else if constexpr (bits_n % (256 * 4) == 0) {
return SM100_TMEM_LOAD_16dp256b4x_16b{};
}
else if constexpr (bits_n % (256 * 2) == 0) {
return SM100_TMEM_LOAD_16dp256b2x_16b{};
}
else if constexpr (bits_n % (256 * 1) == 0) {
return SM100_TMEM_LOAD_16dp256b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b1x>) {
if constexpr (bits_n % (128 * 64) == 0) {
return SM100_TMEM_LOAD_16dp128b64x{};
}
else if constexpr (bits_n % (128 * 32) == 0) {
return SM100_TMEM_LOAD_16dp128b32x{};
}
else if constexpr (bits_n % (128 * 16) == 0) {
return SM100_TMEM_LOAD_16dp128b16x{};
}
else if constexpr (bits_n % (128 * 8) == 0) {
return SM100_TMEM_LOAD_16dp128b8x{};
}
else if constexpr (bits_n % (128 * 4) == 0) {
return SM100_TMEM_LOAD_16dp128b4x{};
}
else if constexpr (bits_n % (128 * 2) == 0) {
return SM100_TMEM_LOAD_16dp128b2x{};
}
else if constexpr (bits_n % (128 * 1) == 0) {
return SM100_TMEM_LOAD_16dp128b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b1x_16b>) {
if constexpr (bits_n % (128 * 64) == 0) {
return SM100_TMEM_LOAD_16dp128b64x_16b{};
}
else if constexpr (bits_n % (128 * 32) == 0) {
return SM100_TMEM_LOAD_16dp128b32x_16b{};
}
else if constexpr (bits_n % (128 * 16) == 0) {
return SM100_TMEM_LOAD_16dp128b16x_16b{};
}
else if constexpr (bits_n % (128 * 8) == 0) {
return SM100_TMEM_LOAD_16dp128b8x_16b{};
}
else if constexpr (bits_n % (128 * 4) == 0) {
return SM100_TMEM_LOAD_16dp128b4x_16b{};
}
else if constexpr (bits_n % (128 * 2) == 0) {
return SM100_TMEM_LOAD_16dp128b2x_16b{};
}
else if constexpr (bits_n % (128 * 1) == 0) {
return SM100_TMEM_LOAD_16dp128b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b1x>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_LOAD_16dp64b128x{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_LOAD_16dp64b64x{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_LOAD_16dp64b32x{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_LOAD_16dp64b16x{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_LOAD_16dp64b8x{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_LOAD_16dp64b4x{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_LOAD_16dp64b2x{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_LOAD_16dp64b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b1x_16b>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_LOAD_16dp64b128x_16b{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_LOAD_16dp64b64x_16b{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_LOAD_16dp64b32x_16b{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_LOAD_16dp64b16x_16b{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_LOAD_16dp64b8x_16b{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_LOAD_16dp64b4x_16b{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_LOAD_16dp64b2x_16b{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_LOAD_16dp64b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b1x>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_LOAD_16dp32b128x{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_LOAD_16dp32b64x{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_LOAD_16dp32b32x{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_LOAD_16dp32b16x{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_LOAD_16dp32b8x{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_LOAD_16dp32b4x{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_LOAD_16dp32b2x{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_LOAD_16dp32b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b1x_16b>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_LOAD_16dp32b128x_16b{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_LOAD_16dp32b64x_16b{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_LOAD_16dp32b32x_16b{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_LOAD_16dp32b16x_16b{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_LOAD_16dp32b8x_16b{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_LOAD_16dp32b4x_16b{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_LOAD_16dp32b2x_16b{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_LOAD_16dp32b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b1x>) {
if constexpr (bits_n % (32 * 128) == 0) {
return SM100_TMEM_LOAD_32dp32b128x{};
}
else if constexpr (bits_n % (32 * 64) == 0) {
return SM100_TMEM_LOAD_32dp32b64x{};
}
else if constexpr (bits_n % (32 * 32) == 0) {
return SM100_TMEM_LOAD_32dp32b32x{};
}
else if constexpr (bits_n % (32 * 16) == 0) {
return SM100_TMEM_LOAD_32dp32b16x{};
}
else if constexpr (bits_n % (32 * 8) == 0) {
return SM100_TMEM_LOAD_32dp32b8x{};
}
else if constexpr (bits_n % (32 * 4) == 0) {
return SM100_TMEM_LOAD_32dp32b4x{};
}
else if constexpr (bits_n % (32 * 2) == 0) {
return SM100_TMEM_LOAD_32dp32b2x{};
}
else if constexpr (bits_n % (32 * 1) == 0) {
return SM100_TMEM_LOAD_32dp32b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b1x_16b>) {
if constexpr (bits_n % (32 * 128) == 0) {
return SM100_TMEM_LOAD_32dp32b128x_16b{};
}
else if constexpr (bits_n % (32 * 64) == 0) {
return SM100_TMEM_LOAD_32dp32b64x_16b{};
}
else if constexpr (bits_n % (32 * 32) == 0) {
return SM100_TMEM_LOAD_32dp32b32x_16b{};
}
else if constexpr (bits_n % (32 * 16) == 0) {
return SM100_TMEM_LOAD_32dp32b16x_16b{};
}
else if constexpr (bits_n % (32 * 8) == 0) {
return SM100_TMEM_LOAD_32dp32b8x_16b{};
}
else if constexpr (bits_n % (32 * 4) == 0) {
return SM100_TMEM_LOAD_32dp32b4x_16b{};
}
else if constexpr (bits_n % (32 * 2) == 0) {
return SM100_TMEM_LOAD_32dp32b2x_16b{};
}
else if constexpr (bits_n % (32 * 1) == 0) {
return SM100_TMEM_LOAD_32dp32b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp256b1x>) {
if constexpr (bits_n % (256 * 32) == 0) {
return SM100_TMEM_STORE_16dp256b32x{};
}
else if constexpr (bits_n % (256 * 16) == 0) {
return SM100_TMEM_STORE_16dp256b16x{};
}
else if constexpr (bits_n % (256 * 8) == 0) {
return SM100_TMEM_STORE_16dp256b8x{};
}
else if constexpr (bits_n % (256 * 4) == 0) {
return SM100_TMEM_STORE_16dp256b4x{};
}
else if constexpr (bits_n % (256 * 2) == 0) {
return SM100_TMEM_STORE_16dp256b2x{};
}
else if constexpr (bits_n % (256 * 1) == 0) {
return SM100_TMEM_STORE_16dp256b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp256b1x_16b>) {
if constexpr (bits_n % (256 * 32) == 0) {
return SM100_TMEM_STORE_16dp256b32x_16b{};
}
else if constexpr (bits_n % (256 * 16) == 0) {
return SM100_TMEM_STORE_16dp256b16x_16b{};
}
else if constexpr (bits_n % (256 * 8) == 0) {
return SM100_TMEM_STORE_16dp256b8x_16b{};
}
else if constexpr (bits_n % (256 * 4) == 0) {
return SM100_TMEM_STORE_16dp256b4x_16b{};
}
else if constexpr (bits_n % (256 * 2) == 0) {
return SM100_TMEM_STORE_16dp256b2x_16b{};
}
else if constexpr (bits_n % (256 * 1) == 0) {
return SM100_TMEM_STORE_16dp256b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp128b1x>) {
if constexpr (bits_n % (128 * 64) == 0) {
return SM100_TMEM_STORE_16dp128b64x{};
}
else if constexpr (bits_n % (128 * 32) == 0) {
return SM100_TMEM_STORE_16dp128b32x{};
}
else if constexpr (bits_n % (128 * 16) == 0) {
return SM100_TMEM_STORE_16dp128b16x{};
}
else if constexpr (bits_n % (128 * 8) == 0) {
return SM100_TMEM_STORE_16dp128b8x{};
}
else if constexpr (bits_n % (128 * 4) == 0) {
return SM100_TMEM_STORE_16dp128b4x{};
}
else if constexpr (bits_n % (128 * 2) == 0) {
return SM100_TMEM_STORE_16dp128b2x{};
}
else if constexpr (bits_n % (128 * 1) == 0) {
return SM100_TMEM_STORE_16dp128b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp128b1x_16b>) {
if constexpr (bits_n % (128 * 64) == 0) {
return SM100_TMEM_STORE_16dp128b64x_16b{};
}
else if constexpr (bits_n % (128 * 32) == 0) {
return SM100_TMEM_STORE_16dp128b32x_16b{};
}
else if constexpr (bits_n % (128 * 16) == 0) {
return SM100_TMEM_STORE_16dp128b16x_16b{};
}
else if constexpr (bits_n % (128 * 8) == 0) {
return SM100_TMEM_STORE_16dp128b8x_16b{};
}
else if constexpr (bits_n % (128 * 4) == 0) {
return SM100_TMEM_STORE_16dp128b4x_16b{};
}
else if constexpr (bits_n % (128 * 2) == 0) {
return SM100_TMEM_STORE_16dp128b2x_16b{};
}
else if constexpr (bits_n % (128 * 1) == 0) {
return SM100_TMEM_STORE_16dp128b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp64b1x>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_STORE_16dp64b128x{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_STORE_16dp64b64x{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_STORE_16dp64b32x{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_STORE_16dp64b16x{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_STORE_16dp64b8x{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_STORE_16dp64b4x{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_STORE_16dp64b2x{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_STORE_16dp64b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp64b1x_16b>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_STORE_16dp64b128x_16b{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_STORE_16dp64b64x_16b{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_STORE_16dp64b32x_16b{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_STORE_16dp64b16x_16b{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_STORE_16dp64b8x_16b{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_STORE_16dp64b4x_16b{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_STORE_16dp64b2x_16b{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_STORE_16dp64b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp32b1x>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_STORE_16dp32b128x{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_STORE_16dp32b64x{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_STORE_16dp32b32x{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_STORE_16dp32b16x{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_STORE_16dp32b8x{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_STORE_16dp32b4x{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_STORE_16dp32b2x{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_STORE_16dp32b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_16dp32b1x_16b>) {
if constexpr (bits_n % (64 * 128) == 0) {
return SM100_TMEM_STORE_16dp32b128x_16b{};
}
else if constexpr (bits_n % (64 * 64) == 0) {
return SM100_TMEM_STORE_16dp32b64x_16b{};
}
else if constexpr (bits_n % (64 * 32) == 0) {
return SM100_TMEM_STORE_16dp32b32x_16b{};
}
else if constexpr (bits_n % (64 * 16) == 0) {
return SM100_TMEM_STORE_16dp32b16x_16b{};
}
else if constexpr (bits_n % (64 * 8) == 0) {
return SM100_TMEM_STORE_16dp32b8x_16b{};
}
else if constexpr (bits_n % (64 * 4) == 0) {
return SM100_TMEM_STORE_16dp32b4x_16b{};
}
else if constexpr (bits_n % (64 * 2) == 0) {
return SM100_TMEM_STORE_16dp32b2x_16b{};
}
else if constexpr (bits_n % (64 * 1) == 0) {
return SM100_TMEM_STORE_16dp32b1x_16b{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_32dp32b1x>) {
if constexpr (bits_n % (32 * 128) == 0) {
return SM100_TMEM_STORE_32dp32b128x{};
}
else if constexpr (bits_n % (32 * 64) == 0) {
return SM100_TMEM_STORE_32dp32b64x{};
}
else if constexpr (bits_n % (32 * 32) == 0) {
return SM100_TMEM_STORE_32dp32b32x{};
}
else if constexpr (bits_n % (32 * 16) == 0) {
return SM100_TMEM_STORE_32dp32b16x{};
}
else if constexpr (bits_n % (32 * 8) == 0) {
return SM100_TMEM_STORE_32dp32b8x{};
}
else if constexpr (bits_n % (32 * 4) == 0) {
return SM100_TMEM_STORE_32dp32b4x{};
}
else if constexpr (bits_n % (32 * 2) == 0) {
return SM100_TMEM_STORE_32dp32b2x{};
}
else if constexpr (bits_n % (32 * 1) == 0) {
return SM100_TMEM_STORE_32dp32b1x{};
}
}
else if constexpr (cute::is_same_v<CopyOp, SM100_TMEM_STORE_32dp32b1x_16b>) {
if constexpr (bits_n % (32 * 128) == 0) {
return SM100_TMEM_STORE_32dp32b128x_16b{};
}
else if constexpr (bits_n % (32 * 64) == 0) {
return SM100_TMEM_STORE_32dp32b64x_16b{};
}
else if constexpr (bits_n % (32 * 32) == 0) {
return SM100_TMEM_STORE_32dp32b32x_16b{};
}
else if constexpr (bits_n % (32 * 16) == 0) {
return SM100_TMEM_STORE_32dp32b16x_16b{};
}
else if constexpr (bits_n % (32 * 8) == 0) {
return SM100_TMEM_STORE_32dp32b8x_16b{};
}
else if constexpr (bits_n % (32 * 4) == 0) {
return SM100_TMEM_STORE_32dp32b4x_16b{};
}
else if constexpr (bits_n % (32 * 2) == 0) {
return SM100_TMEM_STORE_32dp32b2x_16b{};
}
else if constexpr (bits_n % (32 * 1) == 0) {
return SM100_TMEM_STORE_32dp32b1x_16b{};
}
}
else {
static_assert(dependent_false<CopyOp>, "Must pass 1x tmem copy operator");
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Select TMEM store corresponding to the provided TMEM load
template <class CopyOp>
CUTE_HOST_DEVICE constexpr auto
tmem_load_to_store(CopyOp) {
if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b1x>) {
return SM100_TMEM_STORE_16dp256b1x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b1x_16b>) {
return SM100_TMEM_STORE_16dp256b1x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b2x>) {
return SM100_TMEM_STORE_16dp256b2x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b2x_16b>) {
return SM100_TMEM_STORE_16dp256b2x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b4x>) {
return SM100_TMEM_STORE_16dp256b4x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b4x_16b>) {
return SM100_TMEM_STORE_16dp256b4x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b8x>) {
return SM100_TMEM_STORE_16dp256b8x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b8x_16b>) {
return SM100_TMEM_STORE_16dp256b8x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b16x>) {
return SM100_TMEM_STORE_16dp256b16x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b16x_16b>) {
return SM100_TMEM_STORE_16dp256b16x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b32x>) {
return SM100_TMEM_STORE_16dp256b32x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp256b32x_16b>) {
return SM100_TMEM_STORE_16dp256b32x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b1x>) {
return SM100_TMEM_STORE_16dp128b1x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b1x_16b>) {
return SM100_TMEM_STORE_16dp128b1x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b2x>) {
return SM100_TMEM_STORE_16dp128b2x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b2x_16b>) {
return SM100_TMEM_STORE_16dp128b2x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b4x>) {
return SM100_TMEM_STORE_16dp128b4x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b4x_16b>) {
return SM100_TMEM_STORE_16dp128b4x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b8x>) {
return SM100_TMEM_STORE_16dp128b8x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b8x_16b>) {
return SM100_TMEM_STORE_16dp128b8x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b16x>) {
return SM100_TMEM_STORE_16dp128b16x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b16x_16b>) {
return SM100_TMEM_STORE_16dp128b16x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b32x>) {
return SM100_TMEM_STORE_16dp128b32x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b32x_16b>) {
return SM100_TMEM_STORE_16dp128b32x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b64x>) {
return SM100_TMEM_STORE_16dp128b64x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp128b64x_16b>) {
return SM100_TMEM_STORE_16dp128b64x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b1x>) {
return SM100_TMEM_STORE_16dp64b1x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b1x_16b>) {
return SM100_TMEM_STORE_16dp64b1x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b2x>) {
return SM100_TMEM_STORE_16dp64b2x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b2x_16b>) {
return SM100_TMEM_STORE_16dp64b2x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b4x>) {
return SM100_TMEM_STORE_16dp64b4x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b4x_16b>) {
return SM100_TMEM_STORE_16dp64b4x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b8x>) {
return SM100_TMEM_STORE_16dp64b8x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b8x_16b>) {
return SM100_TMEM_STORE_16dp64b8x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b16x>) {
return SM100_TMEM_STORE_16dp64b16x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b16x_16b>) {
return SM100_TMEM_STORE_16dp64b16x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b32x>) {
return SM100_TMEM_STORE_16dp64b32x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b32x_16b>) {
return SM100_TMEM_STORE_16dp64b32x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b64x>) {
return SM100_TMEM_STORE_16dp64b64x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b64x_16b>) {
return SM100_TMEM_STORE_16dp64b64x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b128x>) {
return SM100_TMEM_STORE_16dp64b128x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp64b128x_16b>) {
return SM100_TMEM_STORE_16dp64b128x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b1x>) {
return SM100_TMEM_STORE_16dp32b1x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b1x_16b>) {
return SM100_TMEM_STORE_16dp32b1x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b2x>) {
return SM100_TMEM_STORE_16dp32b2x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b2x_16b>) {
return SM100_TMEM_STORE_16dp32b2x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b4x>) {
return SM100_TMEM_STORE_16dp32b4x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b4x_16b>) {
return SM100_TMEM_STORE_16dp32b4x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b8x>) {
return SM100_TMEM_STORE_16dp32b8x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b8x_16b>) {
return SM100_TMEM_STORE_16dp32b8x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b16x>) {
return SM100_TMEM_STORE_16dp32b16x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b16x_16b>) {
return SM100_TMEM_STORE_16dp32b16x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b32x>) {
return SM100_TMEM_STORE_16dp32b32x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b32x_16b>) {
return SM100_TMEM_STORE_16dp32b32x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b64x>) {
return SM100_TMEM_STORE_16dp32b64x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b64x_16b>) {
return SM100_TMEM_STORE_16dp32b64x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b128x>) {
return SM100_TMEM_STORE_16dp32b128x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_16dp32b128x_16b>) {
return SM100_TMEM_STORE_16dp32b128x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b1x>) {
return SM100_TMEM_STORE_32dp32b1x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b1x_16b>) {
return SM100_TMEM_STORE_32dp32b1x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b2x>) {
return SM100_TMEM_STORE_32dp32b2x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b2x_16b>) {
return SM100_TMEM_STORE_32dp32b2x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b4x>) {
return SM100_TMEM_STORE_32dp32b4x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b4x_16b>) {
return SM100_TMEM_STORE_32dp32b4x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b8x>) {
return SM100_TMEM_STORE_32dp32b8x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b8x_16b>) {
return SM100_TMEM_STORE_32dp32b8x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b16x>) {
return SM100_TMEM_STORE_32dp32b16x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b16x_16b>) {
return SM100_TMEM_STORE_32dp32b16x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b32x>) {
return SM100_TMEM_STORE_32dp32b32x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b32x_16b>) {
return SM100_TMEM_STORE_32dp32b32x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b64x>) {
return SM100_TMEM_STORE_32dp32b64x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b64x_16b>) {
return SM100_TMEM_STORE_32dp32b64x_16b{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b128x>) {
return SM100_TMEM_STORE_32dp32b128x{};
}
else if constexpr (is_same_v<CopyOp, SM100_TMEM_LOAD_32dp32b128x_16b>) {
return SM100_TMEM_STORE_32dp32b128x_16b{};
}
else {
static_assert(dependent_false<CopyOp>, "No TMEM_STORE matching for provided TMEM_LOAD");
}
}
} // namespace TMEM
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// UTCCP Copy Traits
//
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace SM100::TMEM::UTCCP {
//
// Specialized copy_unpack implementation for SM100::TMEM::UTCCP instructions
//
template <class CopyOp,
class TS, class SLayout,
class TD, class DLayout>
CUTE_HOST_DEVICE constexpr
void
copy_unpack(Copy_Traits<CopyOp> const&,
Tensor<TS,SLayout> const& src,
Tensor<TD,DLayout> & dst)
{
static_assert(is_rmem<TS>::value, "Expected smem_desc src for SM100_UTCCP");
static_assert(is_tmem<TD>::value, "Expected tmem dst for SM100_UTCCP");
CopyOp::copy(src[0], raw_pointer_cast(dst.data()));
}
} // end namespace SM100::TMEM::UTCCP
// In the following UTCCP traits, the ValID is representing:
// logical_bit_idx -> tmem_addr_offset.
// And the logical_bit_idx is numbered in the order of:
// [core_matrix_strided, core_matrix_leading, broadcast, repeat].
// The first two modes provide convenience for smem_desc construtction.
// The last two modes provide boradcast transformation for 4x32DP and 2x64DP.
// With above, the strides of first two modes are neccessary to be TMEM::DP_b and 1.
// And the stride of the third mode in the SrcLayout must be zero.
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_128dp256bit_1cta;
template <>
struct Copy_Traits<SM100_UTCCP_128dp256bit_1cta>
{
using ThrID = Layout<_1>;
using ValID = Layout<Shape <_128, _256>,
Stride<TMEM::DP_b, _1>>;
using SrcLayout = Layout<Shape<_1, _32768>,
Stride<_0, _1>>;
using DstLayout = Layout<Shape<_1, _32768>,
Stride<_0,_1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_128dp256bit_2cta;
template <>
struct Copy_Traits<SM100_UTCCP_128dp256bit_2cta>
{
using ThrID = Layout<_2>;
using ValID = typename Copy_Traits<SM100_UTCCP_128dp256bit_1cta>::ValID;
using SrcLayout = Layout<Shape <_2, _32768>,
Stride<_0, _1>>;
using DstLayout = Layout<Shape <_2, _32768>,
Stride<_0, _1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_128dp128bit_1cta;
template <>
struct Copy_Traits<SM100_UTCCP_128dp128bit_1cta>
{
using ThrID = Layout<_1>;
using ValID = Layout<Shape <_128, _128>,
Stride<TMEM::DP_b, _1>>;
using SrcLayout = Layout<Shape<_1, _16384>,
Stride<_0, _1>>;
using DstLayout = Layout<Shape<_1, _16384>,
Stride<_0,_1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_128dp128bit_2cta;
template <>
struct Copy_Traits<SM100_UTCCP_128dp128bit_2cta>
{
using ThrID = Layout<_2>;
using ValID = typename Copy_Traits<SM100_UTCCP_128dp128bit_1cta>::ValID;
using SrcLayout = Layout<Shape <_2, _16384>,
Stride<_0, _1>>;
using DstLayout = Layout<Shape <_2, _16384>,
Stride<_0, _1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_4dp256bit_1cta;
template <>
struct Copy_Traits<SM100_UTCCP_4dp256bit_1cta>
{
/*
4DP is really hard to model if we consider this instruction as a "copy" instruction.
But, if we take it as "TMEM refresh" instruction, then everything goes out naturally.
4DP utccp is designed to refresh the last 4 lanes of each tmem subpartition.
So, in the kernel implementation, we usually only don't need to iterate on MMA_M dimension,
but only need to iterate on MMA_K dimension.
And in each refresh, logically we are refreshing MMA's 128 rows M + 256bit K.
So the "atom_v" should be (refresh_m, refresh_k) instead of (copy_m, copy_k).
And the Src/DstLayout below is: copy_bits -> logical_refresh_bits.
*/
using ThrID = Layout<_1>;
using ValID = Layout<Shape <_128, _256>,
Stride<TMEM::DP_b,_1>>;
using SrcLayout = Layout<Shape <_1,Shape <_4, _256>>,
Stride<_0,Stride<_32,_128>>>;
using DstLayout = Layout<Shape <_1,Shape <_4, _256>>,
Stride<_0,Stride<_32,_128>>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_4dp256bit_2cta;
template <>
struct Copy_Traits<SM100_UTCCP_4dp256bit_2cta>
{
using ThrID = Layout<_2>;
using ValID = typename Copy_Traits<SM100_UTCCP_4dp256bit_1cta>::ValID;
using SrcLayout = Layout<Shape <_2,Shape <_4, _256>>,
Stride<_0,Stride<_32,_128>>>;
using DstLayout = Layout<Shape <_2,Shape <_4, _256>>,
Stride<_0,Stride<_32,_128>>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_4x32dp128bit_1cta;
template <>
struct Copy_Traits<SM100_UTCCP_4x32dp128bit_1cta>
{
using _DP = TMEM::DP_b;
using _DPx32 = Int<_DP{}*32>;
using ThrID = Layout<_1>;
// logical bit_idx -> tmem_addr
// [core_matrix_strided, core_matrix_leading, broadcast]
using ValID = Layout<Shape <_32,_128,_4>,
Stride<_DP,_1, _DPx32>>;
using SrcLayout = Layout<Shape <_1,Shape <_32,_128,_4>>,
Stride<_0,Stride<_1, _32, _0>>>;
using DstLayout = Layout<Shape <_1,_16384>,
Stride<_0,_1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_4x32dp128bit_2cta;
template <>
struct Copy_Traits<SM100_UTCCP_4x32dp128bit_2cta>
{
using ThrID = Layout<_2>;
using ValID = typename Copy_Traits<SM100_UTCCP_4x32dp128bit_1cta>::ValID;
using SrcLayout = Layout<Shape <_2,Shape <_32,_128,_4>>,
Stride<_0,Stride<_1, _32, _0>>>;
using DstLayout = Layout<Shape<_2, _16384>,
Stride<_0,_1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0213_1cta;
template <>
struct Copy_Traits<SM100_UTCCP_2x64dp128bitlw0213_1cta>
{
using _DP = TMEM::DP_b;
using _DPx64 = Int<_DP{}*64>;
using ThrID = Layout<_1>;
// logical bit_idx -> tmem_addr
// [core_matrix_strided, core_matrix_leading, broadcast]
using ValID = Layout<Shape <_64,_128,_2>,
Stride<_DP,_1, _DPx64>>;
using SrcLayout = Layout<Shape <_1,Shape <_64,_128,_2>>,
Stride<_0,Stride<_1, _64, _0>>>;
using DstLayout = Layout<Shape<_1, _16384>,
Stride<_0, _1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0213_2cta;
template <>
struct Copy_Traits<SM100_UTCCP_2x64dp128bitlw0213_2cta>
{
using ThrID = Layout<_2>;
using ValID = typename Copy_Traits<SM100_UTCCP_2x64dp128bitlw0213_1cta>::ValID;
using SrcLayout = Layout<Shape <_2,Shape <_64,_128,_2>>,
Stride<_0,Stride<_1, _64, _0>>>;
using DstLayout = Layout<Shape<_2, _16384>,
Stride<_0, _1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0123_1cta;
template <>
struct Copy_Traits<SM100_UTCCP_2x64dp128bitlw0123_1cta>
{
using _DP = TMEM::DP_b;
using _DPx32 = Int<_DP{}*32>;
using _DPx64 = Int<_DP{}*64>;
using ThrID = Layout<_1>;
// logical bit_idx -> tmem_addr
// [core_matrix_strided, core_matrix_leading, repeat, broadcast]
using ValID = Layout<Shape <_32,_128,_2, _2>,
Stride<_DP,_1 ,_DPx64,_DPx32>>;
using SrcLayout = Layout<Shape <_1,Shape <_32,_128,_2,_2>>,
Stride<_0,Stride<_1, _32,_4096,_0>>>;
using DstLayout = Layout<Shape<_1, _16384>,
Stride<_0, _1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
using SM100::TMEM::UTCCP::SM100_UTCCP_2x64dp128bitlw0123_2cta;
template <>
struct Copy_Traits<SM100_UTCCP_2x64dp128bitlw0123_2cta>
{
using ThrID = Layout<_2>;
using ValID = typename Copy_Traits<SM100_UTCCP_2x64dp128bitlw0123_1cta>::ValID;
using SrcLayout = Layout<Shape <_2,Shape <_32,_128,_2,_2>>,
Stride<_0,Stride<_1, _32, _4096,_0>>>;
using DstLayout = Layout<Shape <_2,_16384>,
Stride<_0,_1>>;
using RefLayout = DstLayout;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template <class CopyOp,
class TEngine, class TLayout>
CUTE_HOST_DEVICE constexpr
auto
make_utccp_copy(CopyOp const&,
Tensor<TEngine,TLayout> const& tmem)
{
static_assert(is_tmem<TEngine>::value, "Expected TMEM tensor.");
using T = typename TEngine::value_type;
using Traits = Copy_Traits<CopyOp>;
using Atom = Copy_Atom<Traits, T>;
// atom thr idx -> tmem addr This is the T in the Layout_TV
auto atom_t_layout = make_layout(size(typename Traits::ThrID{}), Int<0>{});
// atom val idx -> tmem addr Cast the CopyOp's value ids to the proper data width
auto atom_v_layout = coalesce(upcast<sizeof_bits<T>::value>(typename Traits::ValID{}));
return make_cotiled_copy(Atom{}, make_layout(atom_t_layout, atom_v_layout), tmem.layout());
}
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cute