CUTLASS 2.7 (#318)

CUTLASS 2.7

Mainloop fusion for GEMM: summation over A or B
Strided DGRAD (optimized iterators)
Half-precision GELU_taylor activation functions
Use these when accumulation and epilogue compute types are all cutlass::half_t
Tuning and bug fixes to fused GEMM + GEMM example
Support for smaller than 128b aligned Convolutions: see examples
Caching of results to accelerate Convolution unit tests
Can be enabled or disabled by running cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF
Corrections and bug fixes reported by the CUTLASS community
Thank you for filing these issues!

authored-by: Haicheng Wu haichengw@nvidia.com, Manish Gupta manigupta@nvidia.com, Dustyn Blasig dblasig@nvidia.com, Andrew Kerr akerr@nvidia.com
This commit is contained in:
Manish Gupta
2021-09-20 11:02:22 -07:00
committed by GitHub
parent 9ac255863f
commit 2e07c4cc2f
62 changed files with 5611 additions and 186 deletions

View File

@ -51,7 +51,6 @@
#include "cutlass/matrix_coord.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv2d_problem_size.h"

View File

@ -162,10 +162,11 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
# iterator algorithm (analytic and optimized)
iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size
# by default, only generate the largest tile size, largest alignment, and optimized iterator
if manifest.args.kernels == '':
tile_descriptions = [tile_descriptions[0],]
alignment_constraints = [alignment_constraints[0],]
iterator_algorithms = [IteratorAlgorithm.Optimized]
operations = []
@ -212,12 +213,21 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
# better for problem sizes with large activation channel count
swizzling_functor_strided_dgrad_ = SwizzlingFunctor.StridedDgradIdentity1
new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)
if IteratorAlgorithm.Analytic in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)
manifest.append(new_operation)
operations.append(new_operation)
manifest.append(new_operation)
operations.append(new_operation)
# Strided support for Optimized Dgrad
if IteratorAlgorithm.Optimized in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Optimized, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)
manifest.append(new_operation)
operations.append(new_operation)
#
# Conv2d Wgrad
#
@ -246,34 +256,70 @@ def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignme
# iterator algorithm (analytic and optimized)
iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size
# by default, only generate the largest tile size and optimized iterators
if manifest.args.kernels == '':
tile_descriptions = [tile_descriptions[0],]
iterator_algorithms = [IteratorAlgorithm.Optimized]
operations = []
# All tile sizes for Conv3dFprop and Conv3dWgrad
for tile in tile_descriptions:
for conv_kind in conv_kinds:
A = TensorDescription(element_a, layout, alignment)
B = TensorDescription(element_b, layout, alignment)
C = TensorDescription(element_c, layout, alignment_c)
#
# Conv3d Fprop
#
if ConvKind.Fprop in conv_kinds:
# Strided support for Analytic and Optimized Fprop
for iterator_algorithm in iterator_algorithms:
A = TensorDescription(element_a, layout, alignment)
B = TensorDescription(element_b, layout, alignment)
C = TensorDescription(element_c, layout, alignment_c)
# optimized conv3d iterator algorithm is only for Wgrad
if (iterator_algorithm == IteratorAlgorithm.Optimized) \
and ((conv_kind == ConvKind.Fprop) or (conv_kind == ConvKind.Dgrad)):
continue
# strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic)
new_operation = Conv3dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
new_operation = Conv3dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided)
manifest.append(new_operation)
operations.append(new_operation)
#
# Conv3d Wgrad
#
if ConvKind.Wgrad in conv_kinds:
# Strided support for Analytic and Optimized Wgrad
for iterator_algorithm in iterator_algorithms:
new_operation = Conv3dOperation(ConvKind.Wgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
manifest.append(new_operation)
operations.append(new_operation)
# All tile sizes for Conv3dDgrad
for tile in tile_descriptions:
A = TensorDescription(element_a, layout, alignment)
B = TensorDescription(element_b, layout, alignment)
C = TensorDescription(element_c, layout, alignment_c)
#
# Conv3d Dgrad
#
if ConvKind.Dgrad in conv_kinds:
# Unity stride for Optimized Dgrad
new_operation = Conv3dOperation(ConvKind.Dgrad, IteratorAlgorithm.Optimized, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor)
manifest.append(new_operation)
operations.append(new_operation)
# Strided support for Analytic Dgrad
# Conv3dDgrad has a naive strided support which does not cut down redundant MMAs
new_operation = Conv3dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
manifest.append(new_operation)
operations.append(new_operation)
return operations
###################################################################################################
###################################################################################################
@ -1158,7 +1204,7 @@ def GenerateSM75_TensorOp_88128(manifest, args):
data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
data_type, alignment_constraints)
#
@ -1934,7 +1980,7 @@ def GenerateSM80_TensorOp_168256(manifest, args):
data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
data_type, alignment_constraints)
#

View File

@ -28,7 +28,6 @@
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/complex.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/library/library.h"