Fix several typos (#1169)

Co-authored-by: isaacw <isaacw@nvidia.com>
This commit is contained in:
wang-y-z
2023-11-03 11:54:46 +08:00
committed by GitHub
parent c008b4aea8
commit 557be3ab0e
21 changed files with 30 additions and 30 deletions

View File

@ -123,7 +123,7 @@ transform(Tensor<EngineIn,LayoutIn>&& tensor_in, Tensor<EngineOut,LayoutOut>&& t
// Similar to std::transform with a binary operation
// Takes two tensors as input and one tensor as output.
// Applies the binary_op to tensor_in1 and and tensor_in2 and
// Applies the binary_op to tensor_in1 and tensor_in2 and
// assigns it to tensor_out
template <class EngineIn1, class LayoutIn1,
class EngineIn2, class LayoutIn2,

View File

@ -576,7 +576,7 @@ depth(Layout<Shape,Stride> const& layout)
// Return the codomain shape of a mode
// @post size(coshape(@a a)) == cosize(@a a)
// @return C Coordinate with smallest elements such that that
// @return C Coordinate with smallest elements such that
// @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout)
// where sub_layout = get<Is...>(layout).
template <int... Is, class Shape, class Stride>

View File

@ -527,7 +527,7 @@ public:
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
// Allocate the the accumulators for the (M,N) blk_shape
// Allocate the accumulators for the (M,N) blk_shape
//
// MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)

View File

@ -540,7 +540,7 @@ public:
auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
// Allocate the the accumulators for the (M,N) blk_shape
// Allocate the accumulators for the (M,N) blk_shape
Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
// Order two Math WG's MMA one after the other, helps hide Epilogue

View File

@ -347,7 +347,7 @@ public:
// The number of tiles for which reduction is required is either:
// (a) the total number of output tiles (in the case of split-K)
// (b) the number of stream-K tiles
// To calculate the the total number of output tiles in the split-K case, we
// To calculate the total number of output tiles in the split-K case, we
// note that, in the split-K case, the units_per_problem_ member of Params will be
// the total number of output tiles.
auto reduction_tiles = params.splits_ > 1 ? params.units_per_problem_ : params.sk_tiles_;

View File

@ -556,7 +556,7 @@ public:
constexpr auto WarpThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
/// Divide a warp_group_tile into 8x8 warp_tiles to futher reduce the reg usage.
/// Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
/// Step 0: Step 1: Step 2: Step 3:
/// W0 W1 W2 W3 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
/// W1 W0 -- -- -- -- -- -- -- -- W3 W2 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

View File

@ -47,7 +47,7 @@ namespace cutlass {
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Wmma array type (WmmaFragmentArray holds elements of of type nvcuda::wmma::fragment)
/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment)
template <
/// Element type
typename T,