@ -123,7 +123,7 @@ transform(Tensor<EngineIn,LayoutIn>&& tensor_in, Tensor<EngineOut,LayoutOut>&& t
|
||||
|
||||
// Similar to std::transform with a binary operation
|
||||
// Takes two tensors as input and one tensor as output.
|
||||
// Applies the binary_op to tensor_in1 and and tensor_in2 and
|
||||
// Applies the binary_op to tensor_in1 and tensor_in2 and
|
||||
// assigns it to tensor_out
|
||||
template <class EngineIn1, class LayoutIn1,
|
||||
class EngineIn2, class LayoutIn2,
|
||||
|
||||
@ -576,7 +576,7 @@ depth(Layout<Shape,Stride> const& layout)
|
||||
|
||||
// Return the codomain shape of a mode
|
||||
// @post size(coshape(@a a)) == cosize(@a a)
|
||||
// @return C Coordinate with smallest elements such that that
|
||||
// @return C Coordinate with smallest elements such that
|
||||
// @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout)
|
||||
// where sub_layout = get<Is...>(layout).
|
||||
template <int... Is, class Shape, class Stride>
|
||||
|
||||
@ -527,7 +527,7 @@ public:
|
||||
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
|
||||
auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
|
||||
|
||||
// Allocate the the accumulators for the (M,N) blk_shape
|
||||
// Allocate the accumulators for the (M,N) blk_shape
|
||||
//
|
||||
// MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
|
||||
auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
|
||||
|
||||
@ -540,7 +540,7 @@ public:
|
||||
auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
|
||||
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
|
||||
|
||||
// Allocate the the accumulators for the (M,N) blk_shape
|
||||
// Allocate the accumulators for the (M,N) blk_shape
|
||||
Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
|
||||
|
||||
// Order two Math WG's MMA one after the other, helps hide Epilogue
|
||||
|
||||
@ -347,7 +347,7 @@ public:
|
||||
// The number of tiles for which reduction is required is either:
|
||||
// (a) the total number of output tiles (in the case of split-K)
|
||||
// (b) the number of stream-K tiles
|
||||
// To calculate the the total number of output tiles in the split-K case, we
|
||||
// To calculate the total number of output tiles in the split-K case, we
|
||||
// note that, in the split-K case, the units_per_problem_ member of Params will be
|
||||
// the total number of output tiles.
|
||||
auto reduction_tiles = params.splits_ > 1 ? params.units_per_problem_ : params.sk_tiles_;
|
||||
|
||||
@ -556,7 +556,7 @@ public:
|
||||
constexpr auto WarpThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
|
||||
/// Divide a warp_group_tile into 8x8 warp_tiles to futher reduce the reg usage.
|
||||
/// Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
|
||||
/// Step 0: Step 1: Step 2: Step 3:
|
||||
/// W0 W1 W2 W3 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
|
||||
/// W1 W0 -- -- -- -- -- -- -- -- W3 W2 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
|
||||
|
||||
@ -47,7 +47,7 @@ namespace cutlass {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Wmma array type (WmmaFragmentArray holds elements of of type nvcuda::wmma::fragment)
|
||||
/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment)
|
||||
template <
|
||||
/// Element type
|
||||
typename T,
|
||||
|
||||
Reference in New Issue
Block a user