3.6.0 update (#2005)

* 3.6.0 update

* doc and swap stuff

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
This commit is contained in:
Yujia Zhai
2024-12-24 22:34:40 -08:00
committed by GitHub
parent e1cd8c7866
commit 3d261a5974
258 changed files with 10863 additions and 3883 deletions

View File

@ -95,36 +95,17 @@ __global__ void copy_kernel(TensorS S, TensorD D, ThreadLayout)
/// Uses `make_tiled_copy()` to perform a copy using vector instructions. This operation
/// has the precondition that pointers are aligned to the vector size.
///
template <class TensorS, class TensorD, class ThreadLayout, class VecLayout>
__global__ void copy_kernel_vectorized(TensorS S, TensorD D, ThreadLayout, VecLayout)
template <class TensorS, class TensorD, class Tiled_Copy>
__global__ void copy_kernel_vectorized(TensorS S, TensorD D, Tiled_Copy tiled_copy)
{
using namespace cute;
using Element = typename TensorS::value_type;
// Slice the tensors to obtain a view into each tile.
Tensor tile_S = S(make_coord(_, _), blockIdx.x, blockIdx.y); // (BlockShape_M, BlockShape_N)
Tensor tile_D = D(make_coord(_, _), blockIdx.x, blockIdx.y); // (BlockShape_M, BlockShape_N)
// Define `AccessType` which controls the size of the actual memory access.
using AccessType = cutlass::AlignedArray<Element, size(VecLayout{})>;
// A copy atom corresponds to one hardware memory access.
using Atom = Copy_Atom<UniversalCopy<AccessType>, Element>;
// Construct tiled copy, a tiling of copy atoms.
//
// Note, this assumes the vector and thread layouts are aligned with contiguous data
// in GMEM. Alternative thread layouts are possible but may result in uncoalesced
// reads. Alternative vector layouts are also possible, though incompatible layouts
// will result in compile time errors.
auto tiled_copy =
make_tiled_copy(
Atom{}, // access size
ThreadLayout{}, // thread layout
VecLayout{}); // vector layout (e.g. 4x1)
// Construct a Tensor corresponding to each thread's slice.
auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x);
ThrCopy thr_copy = tiled_copy.get_thread_slice(threadIdx.x);
Tensor thr_tile_S = thr_copy.partition_S(tile_S); // (CopyOp, CopyM, CopyN)
Tensor thr_tile_D = thr_copy.partition_D(tile_D); // (CopyOp, CopyM, CopyN)
@ -198,11 +179,34 @@ int main(int argc, char** argv)
Tensor tiled_tensor_S = tiled_divide(tensor_S, block_shape); // ((M, N), m', n')
Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape); // ((M, N), m', n')
// Thread arrangement
Layout thr_layout = make_layout(make_shape(Int<32>{}, Int<8>{}));
// Construct a TiledCopy with a specific access pattern.
// This version uses a
// (1) Layout-of-Threads to describe the number and arrangement of threads (e.g. row-major, col-major, etc),
// (2) Layout-of-Values that each thread will access.
// Vector dimensions
Layout vec_layout = make_layout(make_shape(Int<4>{}, Int<1>{}));
// Thread arrangement
Layout thr_layout = make_layout(make_shape(Int<32>{}, Int<8>{})); // (32,8) -> thr_idx
// Value arrangement per thread
Layout val_layout = make_layout(make_shape(Int<4>{}, Int<1>{})); // (4,1) -> val_idx
// Define `AccessType` which controls the size of the actual memory access instruction.
using CopyOp = UniversalCopy<uint_byte_t<sizeof(Element) * size(val_layout)>>; // A very specific access width copy instruction
//using CopyOp = UniversalCopy<cutlass::AlignedArray<Element, size(val_layout)>>; // A more generic type that supports many copy strategies
//using CopyOp = AutoVectorizingCopy; // An adaptable-width instruction that assumes maximal alignment of inputs
// A Copy_Atom corresponds to one CopyOperation applied to Tensors of type Element.
using Atom = Copy_Atom<CopyOp, Element>;
// Construct tiled copy, a tiling of copy atoms.
//
// Note, this assumes the vector and thread layouts are aligned with contiguous data
// in GMEM. Alternative thread layouts are possible but may result in uncoalesced
// reads. Alternative value layouts are also possible, though incompatible layouts
// will result in compile time errors.
TiledCopy tiled_copy = make_tiled_copy(Atom{}, // Access strategy
thr_layout, // thread layout (e.g. 32x8 Col-Major)
val_layout); // value layout (e.g. 4x1)
//
// Determine grid and block dimensions
@ -217,8 +221,7 @@ int main(int argc, char** argv)
copy_kernel_vectorized<<< gridDim, blockDim >>>(
tiled_tensor_S,
tiled_tensor_D,
thr_layout,
vec_layout);
tiled_copy);
cudaError result = cudaDeviceSynchronize();
if (result != cudaSuccess) {