CUTLASS 2.10 (#615)

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
This commit is contained in:
ANIKET SHIVAM
2022-09-03 15:48:46 -07:00
committed by GitHub
parent ca23ff7924
commit b72cbf957d
289 changed files with 43708 additions and 2513 deletions

View File

@ -149,6 +149,35 @@ class GemmOperation:
''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
return self.procedural_name()
###################################################################################################
#
# Data structure modeling a grouped GEMM operation
#
###################################################################################################
#
class GroupedGemmOperation(GemmOperation):
    """A GemmOperation that additionally records how grouped problems are scheduled.

    All GEMM attributes are initialized by the GemmOperation constructor; this
    subclass only adds ``scheduler_mode`` and reflects it in the procedural name.
    """
    #
    def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue,
                 epilogue_functor = EpilogueFunctor.LinearCombination,
                 swizzling_functor = SwizzlingFunctor.Identity8,
                 scheduler_mode = GroupScheduleMode.Device):
        """Forward the GEMM description to GemmOperation and store the scheduler mode.

        ``scheduler_mode`` is a GroupScheduleMode member and defaults to
        ``GroupScheduleMode.Device``. The remaining arguments are passed
        positionally, unchanged, to ``GemmOperation.__init__``.
        """
        super().__init__(
            gemm_kind, arch, tile_description, A, B, C, element_epilogue,
            epilogue_functor, swizzling_functor)
        self.scheduler_mode = scheduler_mode
    #
    def procedural_name(self):
        """Return the base procedural name extended with a ``_schedule<mode>`` suffix.

        ``<mode>`` is the entry of ShortGroupScheduleModeNames for this
        operation's ``scheduler_mode``.
        """
        # Build the template from the parent name first, then fill in the suffix.
        name_template = super().procedural_name() + "_schedule${schedule}"
        substitutions = {
            'schedule': ShortGroupScheduleModeNames[self.scheduler_mode]
        }
        return SubstituteTemplate(name_template, substitutions)
###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
@ -738,6 +767,7 @@ using ${operation_name}_base =
${epilogue_functor},
${swizzling_functor},
${stages},
${scheduler_mode},
${math_operation}
>::GemmKernel;
@ -817,6 +847,7 @@ ${compile_guard_end}
'align_b': str(operation.B.alignment),
'transform_a': ComplexTransformTag[operation.A.complex_transform],
'transform_b': ComplexTransformTag[operation.B.complex_transform],
'scheduler_mode': GroupScheduleModeTag[operation.scheduler_mode],
'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation]
}

View File

@ -180,7 +180,7 @@ def CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, \
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = GemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
new_operation = GroupedGemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
manifest.append(new_operation)
@ -346,7 +346,7 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
# iterator algorithm (analytic and optimized)
#iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
iterator_algorithms = [IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size, largest alignment, and optimized iterator
if manifest.kernel_filter == '':
tile_descriptions = [tile_descriptions[0],]
@ -527,7 +527,7 @@ def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignme
alignment_c = min(8, alignment)
# iterator algorithm (analytic and optimized)
#iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
# iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
iterator_algorithms = [IteratorAlgorithm.Optimized]
# by default, only generate the largest tile size and optimized iterators
@ -1677,7 +1677,6 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [8, 4, 2]
@ -1694,12 +1693,14 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version):
TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 3, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
@ -1773,23 +1774,22 @@ def GenerateSM80_SparseTensorOp_16832(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [8]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
@ -1917,7 +1917,7 @@ def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
smem_usage = 164
alignment_constraints = [16,]
@ -1931,10 +1931,10 @@ def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -1986,22 +1986,21 @@ def GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [16,]
tile_descriptions = [
TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
@ -2102,8 +2101,6 @@ def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [32,]
for math_inst in math_instructions:
@ -2116,11 +2113,11 @@ def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):
TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2173,21 +2170,19 @@ def GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [32,]
tile_descriptions = [
TileDescription([ 64, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
@ -2338,7 +2333,6 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [4, 2, 1]
@ -2354,11 +2348,11 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version):
TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2424,7 +2418,6 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [4, 2, 1]
@ -2440,11 +2433,11 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):
TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2483,7 +2476,6 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [4, 2, 1]
@ -2497,8 +2489,8 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):
TileDescription([ 64, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([ 64, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2583,23 +2575,22 @@ def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [4]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([ 64, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
@ -3047,7 +3038,6 @@ def GenerateSM80_TensorOp_884(manifest, cuda_version):
min_cc = 80
max_cc = 1024
max_cc_smem_limited = 80
alignment_constraints = [1,]

View File

@ -456,7 +456,7 @@ OperationKindNames = {
#
class Target(enum.Enum):
library = enum_auto()
#
ArchitectureNames = {
50: 'maxwell',
60: 'pascal',
@ -466,6 +466,16 @@ ArchitectureNames = {
80: 'ampere',
}
#
# Shared memory (SMEM) capacity in KB, keyed by compute capability.
# The manifest compares these limits against CalculateSmemUsage(operation) when
# deciding whether an operation is enabled for a compute capability; compute
# capabilities that do not appear in this table are not subject to that check.
SharedMemPerCC = {
70: 96, # 96KB of SMEM
72: 96, # 96KB of SMEM
75: 64, # 64KB of SMEM
80: 160, # 164KB of SMEM - 4KB reserved for the driver
86: 100, # 100KB of SMEM
87: 160, # 164KB of SMEM - 4KB reserved for the driver
}
###################################################################################################
#
@ -564,6 +574,23 @@ SwizzlingFunctorTag = {
SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
}
#
class GroupScheduleMode(enum.Enum):
    """Scheduling modes selectable for grouped GEMM operations.

    The C++ enumerator emitted for each member is given by
    GroupScheduleModeTag, and the suffix used in generated kernel names by
    ShortGroupScheduleModeNames.
    """
    # NOTE: there must be no trailing comma after enum_auto(). A trailing comma
    # makes the assigned value a one-element tuple rather than a plain auto
    # value, which leaves the members with inconsistent value types.
    Device = enum_auto()
    Host = enum_auto()
#
# Maps each GroupScheduleMode member to the fully qualified C++ enumerator
# string that is substituted for ${scheduler_mode} in emitted kernel code.
GroupScheduleModeTag = {
GroupScheduleMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
GroupScheduleMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
}
#
# Short, human-readable name for each GroupScheduleMode member; used to build
# the "_schedule<name>" suffix of a grouped GEMM operation's procedural name.
ShortGroupScheduleModeNames = {
GroupScheduleMode.Device: 'Device',
GroupScheduleMode.Host: 'Host'
}
###################################################################################################
#
@ -636,7 +663,6 @@ class MathInstruction:
self.opcode_class = opcode_class
self.math_operation = math_operation
#
class TileDescription:
@ -681,3 +707,29 @@ class TriangularTensorDescription:
self.complex_transform = complex_transform
###################################################################################################
#
def CalculateSmemUsage(operation):
    """Estimate an operation's shared memory usage, in whole KB (truncated).

    The per-stage size is the byte size of the A and B threadblock tiles,
    where element widths come from DataTypeSize in bits (hence the ``// 8``).
    That size is multiplied by ``tile_description.stages`` and shifted right
    by 10 to convert bytes to KB.

    For sparse GEMMs the A tile is counted at half its K extent and the
    sparsity metadata bytes are added. For every other operation both tiles
    are sized with A's element type.
    """
    tile = operation.tile_description
    shape = tile.threadblock_shape
    num_stages = tile.stages
    tile_m, tile_n, tile_k = shape[0], shape[1], shape[2]

    is_sparse_gemm = (
        operation.operation_kind == OperationKind.Gemm
        and operation.gemm_kind == GemmKind.Sparse
    )

    if not is_sparse_gemm:
        # Few BLAS3 operations only have A tensor, so A's element type is
        # used for both tiles here.
        a_bits = DataTypeSize[operation.A.element]
        bytes_per_stage = a_bits * tile_m * tile_k // 8 + \
            a_bits * tile_n * tile_k // 8
        return ((bytes_per_stage * num_stages) >> 10)

    a_bits = DataTypeSize[operation.A.element]
    b_bits = DataTypeSize[operation.B.element]

    # Elements represented by 8 bits of metadata (based on 4:8, 2:4 or 1:2 sparsity):
    # 2 for 32-bit elements, 8 for 4-bit elements, 4 otherwise.
    elements_per_8b_md = {32: 2, 4: 8}.get(a_bits, 4)

    half_k = tile_k // 2
    a_bytes = a_bits * tile_m * half_k // 8
    b_bytes = b_bits * tile_n * tile_k // 8
    metadata_bytes = tile_m * half_k // elements_per_8b_md

    bytes_per_stage = a_bytes + b_bytes + metadata_bytes
    return ((bytes_per_stage * num_stages) >> 10)
###################################################################################################

View File

@ -276,7 +276,8 @@ class Manifest:
for cc in self.compute_capabilities:
if cc >= operation.tile_description.minimum_compute_capability and \
cc <= operation.tile_description.maximum_compute_capability:
cc <= operation.tile_description.maximum_compute_capability and \
(cc not in SharedMemPerCC or SharedMemPerCC[cc] >= CalculateSmemUsage(operation)):
enabled = True
break

View File

@ -0,0 +1,120 @@
# PyCUTLASS: CUTLASS Python Interface
PyCUTLASS is a Python interface to the CUTLASS C++ template library. PyCUTLASS takes user-defined operation descriptions, emits C++ code, and compiles it with `nvcc` or `nvrtc`. It also provides wrappers for user-provided arguments from [numpy](https://numpy.org/), [torch](https://pytorch.org/), and [cupy](https://github.com/cupy/cupy), and encodes them into the kernel's parameters.
```python
import pycutlass
from pycutlass import *
import torch
pycutlass.get_memory_pool(2**8, 2**32)
math_inst = MathInstruction(
[1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
cutlass.OpClass.Simt, MathOperation.multiply_add
)
tile_description = TileDescription(
[128, 128, 8], 4, [2, 4, 1],
math_inst, 80, 80
)
A = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
B = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
C = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
operation = GemmOperationUniversal(
arch=80, tile_description=tile_description,
A=A, B=B, C=C, element_epilogue=cutlass.float32,
epilogue_functor=EpilogueFunctor.LinearCombination,
swizzling_functor=cutlass.IdentitySwizzle1
)
pycutlass.compiler.add_module([operation,])
problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_D = torch.empty_like(tensor_C)
alpha = 1.0
beta = 0.0
arguments = GemmArguments(
operation=operation, problem_size=problem_size,
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
output_op=LinearCombinationFunctorArguments(alpha, beta),
gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
)
operation.run(arguments)
arguments.sync()
tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
assert torch.equal(tensor_D, tensor_D_ref)
```
PyCUTLASS also provides infrastructure for profiling, compiled artifact management, and pooled memory management.
## Installation
### Using Docker
You can run PyCUTLASS in the NGC PyTorch container.
```shell
docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:22.08-py3
```
PyCUTLASS additionally depends on the Boost C++ library, which can be installed with
```bash
apt-get update
apt-get -y install libboost-all-dev
```
### Environment variables
PyCUTLASS requires two environment variables:
* `CUTLASS_PATH`: the root directory of CUTLASS
* `CUDA_INSTALL_PATH`: the directory where cuda toolkit is installed
After setting these two environment variables, PyCUTLASS can be installed with
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass && bash build.sh
```
## Examples
Examples can be found in `$CUTLASS_PATH/examples/40_cutlass_py`
## Test
The test cases are listed in `$CUTLASS_PATH/tools/library/scripts/pycutlass/test`. The unit tests can be run with
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/unit && python test_sm80.py
```
## Troubleshooting
### Issue 1: permission denied
Building PyCUTLASS requires installing dependencies into your Python environment. If you do not have permission to do so, using conda could be an option.
### Issue 2: rmm: module not found
PyCUTLASS manages device memory with [RMM](https://github.com/rapidsai/rmm). Our `build.sh` automatically pulls the [rmm branch-22.08](https://github.com/rapidsai/rmm/tree/branch-22.08) from GitHub and builds it from source. RMM is located at `$CUTLASS_PATH/tools/library/scripts/pycutlass/rmm`. It requires `cmake > 3.20.1`. If the build fails, it can be fixed manually with the following steps:
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm && ./build.sh librmm rmm
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm/python
python setup.py build_ext --inplace
python setup.py install
```
To test whether rmm is successfully installed, try `import rmm`. For other issues related to rmm, please check https://github.com/rapidsai/rmm/issues.

View File

@ -0,0 +1,4 @@
# Install script: runs the steps below in order from the current directory.
# NOTE(review): no `set -e` is present, so a failing step does not stop the
# later ones — confirm whether that is intended.

# pybind11 is installed before the package's own setup.py is run.
pip install pybind11
# Fetch the googletest sources into ./googletest.
git clone https://github.com/google/googletest.git
# Build and install the Python package defined by setup.py.
python setup.py install
# Custom setup.py command; presumably fetches and builds RMM — verify in setup.py.
python setup.py rmm

View File

@ -0,0 +1,2 @@
# Documentation build script.

# Install the package in development mode.
# NOTE(review): presumably so the Sphinx autodoc extension can import it — confirm.
python setup.py develop
# Render the Sphinx sources in docs/source/ as HTML into docs/build/html.
sphinx-build -b html docs/source/ docs/build/html

View File

@ -0,0 +1,52 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View File

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

View File

@ -0,0 +1,93 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = "PyCutlass"
copyright = "2022, Andrew Kerr; Zhaodong Chen; Haicheng Wu; Szymon Migacz; Graham Markall"
author = "Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall"

# -- General configuration ---------------------------------------------------

# Sphinx extension modules to load, given as importable module names.
# Built-in ones live under 'sphinx.ext.*'; 'enum_tools.autoenum' is an
# external extension that provides the `autoenum` directive.
extensions = [
    "sphinx.ext.duration",
    "sphinx.ext.doctest",
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "enum_tools.autoenum",
    "sphinx.ext.autosummary",
]

# autosummary settings.
autosummary_generate = True
autosummary_imported_members = True

# Template directories, relative to this configuration directory.
templates_path = ["_templates"]

# Glob patterns (relative to the source directory) for files and directories
# that must not be treated as documentation sources. The same patterns also
# apply to html_static_path and html_extra_path.
exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# Theme used for HTML and HTML Help pages.
html_theme = "classic"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

View File

@ -0,0 +1,13 @@
CONV2D Operation
================

.. autoclass:: pycutlass.Conv2dOperation
    :special-members:
    :members: run
    :exclude-members: __weakref__, configuration_name, core_name, extended_name, procedural_name

.. autoclass:: pycutlass.Conv2dArguments
    :special-members:
    :members:
    :exclude-members: initialize
    :show-inheritance:

View File

@ -0,0 +1,2 @@
cutlass
=======

View File

@ -0,0 +1,6 @@
Descriptions
==============

.. autoclass:: pycutlass.TileDescription
    :special-members:
    :members:

View File

@ -0,0 +1,5 @@
Frontend
==============

.. autoclass:: pycutlass.NumpyFrontend
    :members:

View File

@ -0,0 +1,18 @@
GEMM Operation
==============

.. autoclass:: pycutlass.GemmOperationUniversal
    :special-members:
    :members:

.. autoclass:: pycutlass.GemmOperationGrouped
    :special-members:
    :members:

.. autoclass:: pycutlass.GemmArguments
    :special-members:
    :members:

.. autoclass:: pycutlass.GemmGroupedArguments
    :special-members:
    :members:

View File

@ -0,0 +1,29 @@
.. PyCutlass documentation master file, created by
   sphinx-quickstart on Sun Jun 19 12:05:42 2022.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to PyCutlass's documentation!
=====================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

.. toctree::

   types
   cutlass
   descriptor
   frontend
   gemm_op
   conv2d_op

View File

@ -0,0 +1,6 @@
Types
========

.. autoenum:: pycutlass.OperationKind
    :members:

View File

@ -0,0 +1,104 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from pycutlass import *
import pycutlass
from pycutlass.test.conv2d_testbed import Conv2dLauncher
if __name__ == "__main__":
    # Set up the pycutlass memory pool and select the nvcc compiler path.
    # NOTE(review): the two pool arguments are presumably sizes in bytes
    # (2**33 each) -- confirm against pycutlass.get_memory_pool.
    pycutlass.get_memory_pool(2**33, 2**33)
    pycutlass.compiler.nvcc()

    # Tensor Core multiply-add: f16 x f16 operands with f32 accumulation.
    mma = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )

    # All three operands are NHWC tensors with alignment 8.
    operand_a = TensorDescription(
        element=mma.element_a, layout=cutlass.TensorNHWC, alignment=8)
    operand_b = TensorDescription(
        element=mma.element_b, layout=cutlass.TensorNHWC, alignment=8)
    operand_c = TensorDescription(
        element=cutlass.float32, layout=cutlass.TensorNHWC, alignment=8)

    tile = TileDescription(
        threadblock_shape=[128, 128, 64],
        stages=4,
        warp_count=[2, 2, 1],
        math_instruction=mma,
        min_compute=80,
        max_compute=80,
    )

    # Forward-propagation conv2d using the optimized iterator algorithm.
    conv_op = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.fprop,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80,
        tile_description=tile,
        A=operand_a,
        B=operand_b,
        C=operand_c,
        element_epilogue=cutlass.float32,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1,
    )

    launcher = Conv2dLauncher(conv_op, verification=False, profiling=True)

    def benchmark_problem():
        # Build a fresh problem-size object for every run so that neither
        # launcher call can observe state left behind by the other.
        return cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(32, 224, 224, 128),
            cutlass.Tensor4DCoord(128, 3, 3, 128),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )

    # Time the same problem through the Python launcher and through the
    # CUTLASS profiler, then report the ratio of the two results.
    python_runtime = launcher.run(
        problem_size=benchmark_problem(),
        split_k_mode=cutlass.conv.SplitKMode.Serial,
    )
    cpp_runtime = launcher.run_cutlass_profiler(
        problem_size=benchmark_problem(),
        split_k_mode=cutlass.conv.SplitKMode.Serial,
    )

    print(cpp_runtime / python_runtime)

View File

@ -0,0 +1,91 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.test.gemm_testbed import GemmUniversalLauncher
if __name__ == '__main__':
    # Set up the pycutlass memory pool and select the nvcc compiler path.
    # NOTE(review): the two pool arguments are presumably sizes in bytes
    # (2**32 each) -- confirm against pycutlass.get_memory_pool.
    pycutlass.get_memory_pool(2**32, 2**32)
    pycutlass.compiler.nvcc()

    # Tensor Core multiply-add: f16 x f16 operands with f32 accumulation.
    mma = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )

    tile = TileDescription(
        threadblock_shape=[256, 128, 32],
        stages=3,
        warp_count=[4, 2, 1],
        math_instruction=mma,
        min_compute=80,
        max_compute=80,
    )

    # A and B are row-major f16; C is column-major f32. All use alignment 4.
    operand_a = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor, alignment=4)
    operand_b = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor, alignment=4)
    operand_c = TensorDescription(
        element=cutlass.float32, layout=cutlass.ColumnMajor, alignment=4)

    gemm_op = GemmOperationUniversal(
        arch=80,
        tile_description=tile,
        A=operand_a,
        B=operand_b,
        C=operand_c,
        element_epilogue=cutlass.float32,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1,
    )

    launcher = GemmUniversalLauncher(gemm_op, verification=False, profiling=True)

    def benchmark_arguments():
        # Fresh keyword arguments (including a new GemmCoord) for each run.
        return dict(
            mode=cutlass.gemm.Mode.Gemm,
            problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096),
        )

    # Time the same 4096^3 GEMM through the Python launcher and through the
    # CUTLASS profiler, then report the ratio of the two results.
    python_runtime = launcher.run(**benchmark_arguments())
    cpp_runtime = launcher.run_cutlass_profiler(**benchmark_arguments())

    print(cpp_runtime / python_runtime)

View File

@ -0,0 +1,9 @@
[build-system]
requires = [
"setuptools",
"scikit-build>0.13.1",
"pybind11",
"numpy<1.23",
"cmake>=3.20.1,!=3.23.0"
]

View File

@ -0,0 +1,79 @@
import distutils.cmd
from setuptools import setup
import setuptools.command.build_py
import os
# build rmm dependency
class BuildRMM(distutils.cmd.Command):
    """Setup command that builds and installs RMM from source when it is missing.

    The command takes no options. When ``import rmm`` already succeeds it does
    nothing; otherwise it clones the ``branch-22.08`` branch of RMM into
    ``./rmm``, runs its ``build.sh`` and installs the Python package.

    Raises:
        subprocess.CalledProcessError: if any of the external steps (git
            clone, build.sh, setup.py) exits with a non-zero status.
    """

    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        # Imported locally so the command does not depend on names outside
        # of this class.
        import subprocess
        import sys

        try:
            import rmm  # noqa: F401 -- only probing whether rmm is installed
        except ImportError:
            print("installing rmm")
            rmm_dir = os.path.abspath("rmm")
            rmm_python_dir = os.path.join(rmm_dir, "python")

            # Each step uses check=True so a failed clone/build stops the
            # command instead of silently continuing with a broken tree, and
            # cwd= so the working directory of this process is never changed.
            subprocess.run(
                ["git", "clone", "-b", "branch-22.08", "--recurse-submodules",
                 "https://github.com/rapidsai/rmm.git", rmm_dir],
                check=True,
            )
            subprocess.run(
                [os.path.join(rmm_dir, "build.sh"), "librmm", "rmm"],
                cwd=rmm_dir,
                check=True,
            )
            # Use the interpreter running this setup script rather than
            # whichever "python" happens to be first on PATH.
            subprocess.run(
                [sys.executable, "setup.py", "build_ext", "--inplace"],
                cwd=rmm_python_dir,
                check=True,
            )
            subprocess.run(
                [sys.executable, "setup.py", "install"],
                cwd=rmm_python_dir,
                check=True,
            )
def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is not set. An explicit raise is used
            instead of ``assert`` so the check still runs under ``python -O``.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            "Environment variable '" + name + "' is not defined.")
    return value


# Roots of the CUTLASS checkout and of the CUDA toolkit installation.
cutlass_path = _require_env('CUTLASS_PATH')
cuda_install_path = _require_env('CUDA_INSTALL_PATH')

# Stays empty when pybind11 is not importable yet; in that case the package
# is set up without the C++ extension.
ext_modules = []

try:
    from pybind11.setup_helpers import Pybind11Extension, build_ext
    include_dirs = [
        cutlass_path + "/include",
        cuda_install_path + "/include",
        cutlass_path + "/tools/util/include",
        cutlass_path + "/test",
        cutlass_path + "/tools/library/scripts/pycutlass/googletest/googletest/include"
    ]

    # Single extension module named "cutlass", built from src/cpp/cutlass.cpp.
    ext_modules = [
        Pybind11Extension("cutlass",
                          ["src/cpp/cutlass.cpp"],
                          include_dirs=include_dirs,
                          extra_compile_args=["-fpermissive"])
    ]
except ImportError:
    # pybind11 is unavailable: fall through with ext_modules == [].
    pass
# Package metadata and build configuration for PyCutlass.
setup(
    name="PyCutlass",
    version="0.0.1",
    author="Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall",
    author_email="zhaodongc@nvidia.com",
    description="Python interface for CUTLASS",
    classifiers=[
        "Programming Language :: Python :: 3",
        # The sources carry "SPDX-License-Identifier: BSD-3-Clause", so the
        # classifier must advertise BSD rather than MIT.
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    package_dir={"": "src"},
    packages=['pycutlass', 'pycutlass.utils', 'pycutlass.test'],
    setup_requires=["pybind11", "numpy<1.23"],
    install_requires=[
        "numpy<1.23",
        'pybind11',
        'cuda-python<11.7.0',
        'typeguard',
        'bfloat16',
        # `typing` is part of the standard library on every interpreter
        # allowed by python_requires; the marker keeps the PyPI backport from
        # being installed where it would shadow the stdlib module.
        'typing; python_version < "3.5"',
        'scikit-build'
    ],
    cmdclass={
        # `python setup.py rmm` builds the RMM dependency from source.
        'rmm': BuildRMM
    },
    ext_modules=ext_modules,
    python_requires=">=3.6",
)

View File

@ -0,0 +1,75 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief In-memory compiled artifact cache
*/
#include <pybind11/pybind11.h>
#include <cstdint>
#include <string>
#include <unordered_map>
namespace py = pybind11;
namespace cutlass {

/// In-memory cache that maps a kernel identifier string to the Python object
/// holding its compiled artifact.
struct CompileCache {
public:
  CompileCache() = default;
  ~CompileCache() = default;

  using Cache = std::unordered_map<std::string, py::object>;

  /// Looks up a previously compiled kernel.
  ///
  /// \param kernel  key under which the artifact was inserted
  /// \return the cached object, or Python `None` when no entry exists
  py::object at(const std::string &kernel) const {
    auto item = cache_.find(kernel);
    if (item == cache_.end()) {
      return py::none();
    }
    return item->second;
  }

  /// Stores a compiled kernel under `kernel`.
  ///
  /// Uses `emplace`, so an entry that already exists for the same key is
  /// left untouched rather than overwritten.
  void insert(const std::string &kernel, const py::object &compiled_kernel) {
    cache_.emplace(kernel, compiled_kernel);
  }

  /// Number of cached kernels. The container reports an unsigned size_t;
  /// the conversion to the signed return type is made explicit.
  int64_t size() const { return static_cast<int64_t>(cache_.size()); }

  /// Removes every cached entry.
  void clear() { cache_.clear(); }

private:
  Cache cache_;
};

} // namespace cutlass

View File

@ -0,0 +1,181 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief binding cutlass C++ APIs to python
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "builtin_types.h"
#include "device_launch_parameters.h"
#include "stddef.h"
#include "cutlass/cutlass.h"
#include "include/conv/convolution.h"
#include "include/gemm/gemm.h"
#include "include/types.h"
#include "include/layout/layout.h"
#include "include/tensor_coord.h"
#include "include/arch.h"
#include "include/tensor_ref_view.h"
#include "include/swizzling.h"
#include "test/conv/convolution.h"
#include "test/gemm/gemm.h"
// Data Types
#include "library.h"
// compiler
#include "compiler.h"
namespace py = pybind11;
// Entry point of the `cutlass` Python extension module.
// Registers every binding on module `m` (and on submodules created below) in
// the order written; the bind_* helpers are called as-is and are expected to
// come from the headers included by this file.
PYBIND11_MODULE(cutlass, m) {
// Module-level docstring.
m.doc() = "cutlass C++ binding";
//
// Data types
//
bind_cutlass_types(m);
//
// Layouts
//
bind_layout(m);
//
// Tensor coordinates
//
bind_tensor_coord(m);
//
// Tensor references and views
//
bind_tensor_refs_and_views(m);
//
// Opcode classes (exposed to Python as `OpClass`)
//
bind_opcode(m);
//
// Convolution bindings live in the `cutlass.conv` submodule
//
py::module_ conv_submodule = m.def_submodule("conv");
bind_convolution(conv_submodule);
//
// GEMM bindings live in the `cutlass.gemm` submodule
//
py::module_ gemm_submodule = m.def_submodule("gemm");
bind_gemm(gemm_submodule);
//
// Threadblock swizzling functors
//
bind_threadblock_swizzle(m);
//
// Test helpers: `cutlass.test.conv` and `cutlass.test.gemm`
//
py::module_ test = m.def_submodule("test");
py::module_ test_conv = test.def_submodule("conv");
bind_convolution_test(test_conv);
py::module_ test_gemm = test.def_submodule("gemm");
bind_gemm_test(test_gemm);
// `cutlass.dtype`: enumeration over cutlass::DataType.
// NOTE(review): only the enumerators listed here are exposed; presumably the
// remaining data types are made available through bind_cutlass_types above --
// confirm before adding or removing entries.
py::enum_<cutlass::DataType>(m, "dtype")
.value("b1", cutlass::DataType::kB1)
.value("u2", cutlass::DataType::kU2)
.value("u4", cutlass::DataType::kU4)
.value("u8", cutlass::DataType::kU8)
.value("u16", cutlass::DataType::kU16)
.value("u32", cutlass::DataType::kU32)
.value("u64", cutlass::DataType::kU64)
.value("s2", cutlass::DataType::kS2)
.value("s4", cutlass::DataType::kS4)
.value("s16", cutlass::DataType::kS16)
.value("s64", cutlass::DataType::kS64)
.value("cf16", cutlass::DataType::kCF16)
.value("cbf16", cutlass::DataType::kCBF16)
.value("cf32", cutlass::DataType::kCF32)
.value("ctf32", cutlass::DataType::kCTF32)
.value("cf64", cutlass::DataType::kCF64)
.value("cs2", cutlass::DataType::kCS2)
.value("cs4", cutlass::DataType::kCS4)
.value("cs8", cutlass::DataType::kCS8)
.value("cs16", cutlass::DataType::kCS16)
.value("cs32", cutlass::DataType::kCS32)
.value("cs64", cutlass::DataType::kCS64)
.value("cu2", cutlass::DataType::kCU2)
.value("cu4", cutlass::DataType::kCU4)
.value("cu8", cutlass::DataType::kCU8)
.value("cu16", cutlass::DataType::kCU16)
.value("cu32", cutlass::DataType::kCU32)
.value("cu64", cutlass::DataType::kCU64)
.value("invalid", cutlass::DataType::kInvalid);
// `cutlass.layout`: enumeration over cutlass::LayoutType.
// NOTE(review): this is likewise a subset of layouts; presumably the common
// ones are bound by bind_layout above -- confirm.
py::enum_<cutlass::LayoutType>(m, "layout")
.value("ColumnMajorInterleaved2", cutlass::LayoutType::kColumnMajorInterleaved2)
.value("RowMajorInterleaved2", cutlass::LayoutType::kRowMajorInterleaved2)
.value("ColumnMajorInterleaved64", cutlass::LayoutType::kColumnMajorInterleaved64)
.value("RowMajorInterleaved64", cutlass::LayoutType::kRowMajorInterleaved64)
.value("TensorNDHWC", cutlass::LayoutType::kTensorNDHWC)
.value("TensorNCHW", cutlass::LayoutType::kTensorNCHW)
.value("TensorNGHWC", cutlass::LayoutType::kTensorNGHWC)
.value("TensorNC64HW64", cutlass::LayoutType::kTensorNC64HW64)
.value("TensorC64RSK64", cutlass::LayoutType::kTensorC64RSK64);
// `cutlass.complex_transform`: none or conjugate.
py::enum_<cutlass::ComplexTransform>(m, "complex_transform")
.value("none", cutlass::ComplexTransform::kNone)
.value("conj", cutlass::ComplexTransform::kConjugate);
//
// `cutlass.CompileCache`: exposes the at / insert / size / clear members of
// cutlass::CompileCache together with its default constructor.
//
py::class_<cutlass::CompileCache>(m, "CompileCache")
.def(py::init<>())
.def("at", &cutlass::CompileCache::at)
.def("insert", &cutlass::CompileCache::insert)
.def("size", &cutlass::CompileCache::size)
.def("clear", &cutlass::CompileCache::clear);
}

View File

@ -0,0 +1,59 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind opcode classes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/arch/mma.h"
namespace py = pybind11;
namespace cutlass {

/// Classification of math operators. This is the enumeration that
/// bind_opcode() exposes to Python under the name `OpClass`.
enum class OpcodeClass {
  kSimt,
  kTensorOp,
  kWmmaTensorOp,
  kSparseTensorOp
};

} // namespace cutlass
/// Registers the `OpClass` enumeration on module `m`.
///
/// Each enumerator of cutlass::OpcodeClass is exposed under a Python name
/// without the `k` prefix, together with a short docstring.
void bind_opcode(py::module &m) {
  py::enum_<cutlass::OpcodeClass>(m, "OpClass",
      R"pbdoc(classification of math operators)pbdoc")
    .value("Simt", cutlass::OpcodeClass::kSimt,
      R"pbdoc(Tag classifying math operators as thread-level operations)pbdoc")
    .value("TensorOp", cutlass::OpcodeClass::kTensorOp,
      R"pbdoc(Tag classifying operators as Tensor Core operations)pbdoc")
    .value("WmmaTensorOp", cutlass::OpcodeClass::kWmmaTensorOp,
      R"pbdoc(Tag classifying operators as WMMA Tensor Core operations)pbdoc")
    .value("SparseTensorOp", cutlass::OpcodeClass::kSparseTensorOp,
      R"pbdoc(Tag classifying operators as Sparse Tensor Core operations)pbdoc");
}

View File

@ -0,0 +1,102 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Convolution problem sizes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/conv/conv2d_problem_size.h"
namespace py = pybind11;
/// Registers `Conv2dProblemSize` and the implicit-GEMM size/extent helper
/// functions on module `m`.
void bind_conv_problem_size(py::module &m) {
  // Local aliases keep the long binding expressions readable.
  using ProblemSize = cutlass::conv::Conv2dProblemSize;
  using ConvOperator = cutlass::conv::Operator;
  using ConvMode = cutlass::conv::Mode;
  using Coord4D = cutlass::Tensor4DCoord;
  using Coord2D = cutlass::MatrixCoord;

  //
  // Conv2d problem size:
  //  include/cutlass/conv/conv2d_problem_size.h
  //
  py::class_<ProblemSize> problem_size(m, "Conv2dProblemSize");

  // Constructors: an all-scalar form (fifteen ints, the mode, two more ints)
  // and a coordinate-based form (three 4-D coords, two matrix coords, the
  // mode, two ints).
  problem_size
    .def(py::init<int, int, int, int, int,
                  int, int, int, int, int,
                  int, int, int, int, int,
                  ConvMode, int, int>())
    .def(py::init<Coord4D, Coord4D, Coord4D, Coord2D, Coord2D, ConvMode, int, int>());

  // Data members, readable and writable from Python.
  problem_size
    .def_readwrite("N", &ProblemSize::N)
    .def_readwrite("H", &ProblemSize::H)
    .def_readwrite("W", &ProblemSize::W)
    .def_readwrite("C", &ProblemSize::C)
    .def_readwrite("P", &ProblemSize::P)
    .def_readwrite("Q", &ProblemSize::Q)
    .def_readwrite("K", &ProblemSize::K)
    .def_readwrite("R", &ProblemSize::R)
    .def_readwrite("S", &ProblemSize::S)
    .def_readwrite("pad_h", &ProblemSize::pad_h)
    .def_readwrite("pad_w", &ProblemSize::pad_w)
    .def_readwrite("stride_h", &ProblemSize::stride_h)
    .def_readwrite("stride_w", &ProblemSize::stride_w)
    .def_readwrite("dilation_h", &ProblemSize::dilation_h)
    .def_readwrite("dilation_w", &ProblemSize::dilation_w)
    .def_readwrite("mode", &ProblemSize::mode)
    .def_readwrite("split_k_slices", &ProblemSize::split_k_slices)
    .def_readwrite("groups", &ProblemSize::groups);

  // Member functions.
  problem_size
    .def("reset_split_k_slices", &ProblemSize::reset_split_k_slices)
    .def("activation_extent", &ProblemSize::activation_extent)
    .def("filter_extent", &ProblemSize::filter_extent)
    .def("output_extent", &ProblemSize::output_extent)
    .def("activation_size", &ProblemSize::activation_size)
    .def("filter_size", &ProblemSize::filter_size)
    .def("output_size", &ProblemSize::output_size);

  // Free functions: tensor sizes. overload_cast selects the overload taking
  // (Operator, const Conv2dProblemSize &).
  m.def("implicit_gemm_tensor_a_size",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_tensor_a_size));
  m.def("implicit_gemm_tensor_b_size",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_tensor_b_size));
  m.def("implicit_gemm_tensor_c_size",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_tensor_c_size));

  // Free functions: tensor extents.
  m.def("implicit_gemm_tensor_a_extent",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_tensor_a_extent));
  m.def("implicit_gemm_tensor_b_extent",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_tensor_b_extent));
  m.def("implicit_gemm_tensor_c_extent",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_tensor_c_extent));

  // Free function: implicit-GEMM problem size for the given operator.
  m.def("implicit_gemm_problem_size",
        py::overload_cast<ConvOperator, const ProblemSize &>(
            &cutlass::conv::implicit_gemm_problem_size));
}

View File

@ -0,0 +1,91 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution related enum types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "conv_problem_size.h"
#include "host.h"
#include "cutlass/conv/convolution.h"
namespace py = pybind11;
/// Registers the convolution enumerations, the problem-size bindings and the
/// `host` helper submodule on module `m`.
void bind_convolution(py::module &m) {
  namespace conv = cutlass::conv;

  //
  // Enumerations from cutlass/include/cutlass/conv/convolution.h
  //

  // Convolutional operator
  py::enum_<conv::Operator> conv_operator(m, "Operator", R"pbdoc(Convolutional operator)pbdoc");
  conv_operator
    .value("fprop", conv::Operator::kFprop, "Forward propagation")
    .value("dgrad", conv::Operator::kDgrad, "Activation grad")
    .value("wgrad", conv::Operator::kWgrad, "Weight grad");

  // Convolution versus cross correlation
  py::enum_<conv::Mode> conv_mode(m, "Mode");
  conv_mode
    .value("cross_correlation", conv::Mode::kCrossCorrelation)
    .value("convolution", conv::Mode::kConvolution);

  // Implementation variants trading performance against simplicity
  py::enum_<conv::IteratorAlgorithm> iterator_algorithm(m, "IteratorAlgorithm",
    R"pbdoc(Selects among several implementation variants trading off performance with simplicity)pbdoc");
  iterator_algorithm
    .value("analytic", conv::IteratorAlgorithm::kAnalytic,
      R"pbdoc(functionally correct in all cases but lower performance)pbdoc")
    .value("optimized", conv::IteratorAlgorithm::kOptimized,
      R"pbdoc(optimized for R <= 32, S <= 32 and unity-stride dgrad)pbdoc")
    .value("fixed_channels", conv::IteratorAlgorithm::kFixedChannels,
      R"pbdoc(Analytic algorithm optimized for fixed channel count (C == AccessSize))pbdoc")
    .value("few_channels", conv::IteratorAlgorithm::kFewChannels,
      R"pbdoc(Analytic algorithm optimized for few channels (C divisible by AccessSize))pbdoc");

  // Stride specializations. The raw docstring below spans two source lines
  // and must keep its embedded line break exactly as written.
  py::enum_<conv::StrideSupport> stride_support(m, "StrideSupport",
    R"pbdoc(Distinguishes among partial specializations that accelerate certain problems where convolution
stride is unit.)pbdoc");
  stride_support
    .value("strided", conv::StrideSupport::kStrided, R"pbdoc(arbitrary convolution stride)pbdoc")
    .value("unity", conv::StrideSupport::kUnity, R"pbdoc(unit convolution stride)pbdoc");

  // Split-K mode
  py::enum_<conv::SplitKMode> split_k_mode(m, "SplitKMode");
  split_k_mode
    .value("None", conv::SplitKMode::kNone)
    .value("Serial", conv::SplitKMode::kSerial)
    .value("Parallel", conv::SplitKMode::kParallel);

  // Conv problem sizes are registered on the same module.
  bind_conv_problem_size(m);

  // Host helper functions go into a nested `host` submodule.
  py::module_ host_submodule = m.def_submodule("host");
  bind_conv_host_helper(host_submodule);
}

View File

@ -0,0 +1,54 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind conv host helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
/// Registers `reorder_convK` on module `m`: it reorders operand B for the
/// interleaved (32) layout of a 2-D convolution.
void bind_conv_host_helper(py::module &m) {
    // int8 tensor reference in the CxRSKx<32> interleaved layout.
    using InterleavedRef =
        cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>>;

    auto reorder = [](InterleavedRef dest,
                      InterleavedRef src,
                      cutlass::conv::Operator conv_op,
                      const cutlass::conv::Conv2dProblemSize &problem_size) {
        // Map the convolution onto its implicit GEMM extent, which is what
        // the reorder routine consumes.
        const cutlass::gemm::GemmCoord gemm_extent =
            cutlass::conv::implicit_gemm_problem_size(conv_op, problem_size);
        cutlass::reorder_convK<32>(dest, src, gemm_extent);
    };

    m.def("reorder_convK", reorder);
}

View File

@ -0,0 +1,77 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm related enum types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/gemm/gemm.h"
#include "host.h"
namespace py = pybind11;
/// Registers the GEMM mode enumeration and the GemmCoord class from
/// cutlass/gemm/gemm.h on module `m`, then adds a `host` submodule with
/// host-side helpers.
void bind_gemm(py::module &m) {
    using Mode = cutlass::gemm::GemmUniversalMode;
    using Coord = cutlass::gemm::GemmCoord;

    // Enumerated GEMM modes.
    py::enum_<Mode> mode_enum(m, "Mode");
    mode_enum.value("Gemm", Mode::kGemm, "Ordinary GEMM & GEMM Split-K serial");
    mode_enum.value("GemmSplitKParallel", Mode::kGemmSplitKParallel, "GEMM Split-K parallel");
    mode_enum.value("Batched", Mode::kBatched, "Batched GEMM");
    mode_enum.value("Array", Mode::kArray);
    mode_enum.value("Invalid", Mode::kInvalid);

    // GemmCoord specifies a location within the coordinate space of a GEMM problem.
    py::class_<Coord> coord_cls(m, "GemmCoord");
    coord_cls.def(py::init<int, int, int>());
    coord_cls.def("m", py::overload_cast<>(&Coord::m));
    coord_cls.def("n", py::overload_cast<>(&Coord::n));
    coord_cls.def("k", py::overload_cast<>(&Coord::k));

    // Two-dimensional projections, returned as MatrixCoord.
    coord_cls.def("mk", [](const Coord &coord) {
        return cutlass::MatrixCoord(coord.mk());
    });
    coord_cls.def("kn", [](const Coord &coord) {
        return cutlass::MatrixCoord(coord.kn());
    });
    coord_cls.def("mn", [](const Coord &coord) {
        return cutlass::MatrixCoord(coord.mn());
    });

    // Host helpers live in a dedicated submodule. It must be a named lvalue
    // because the helper takes a non-const reference.
    py::module_ host = m.def_submodule("host");
    bind_gemm_host_helper(host);
}

View File

@ -0,0 +1,47 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm host helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
/// Registers two `reorder_column` overloads on module `m`, one for each
/// 32-interleaved int8 matrix layout (row-major and column-major).
void bind_gemm_host_helper(py::module &m) {
    using RowInterleaved = cutlass::layout::RowMajorInterleaved<32>;
    using ColumnInterleaved = cutlass::layout::ColumnMajorInterleaved<32>;

    // Both are registered under the same Python name, in this order.
    m.def("reorder_column", &cutlass::reorder_column<32, int8_t, RowInterleaved>);
    m.def("reorder_column", &cutlass::reorder_column<32, int8_t, ColumnInterleaved>);
}

View File

@ -0,0 +1,47 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind CUTLASS layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "tensor.h"
#include "matrix.h"
namespace py = pybind11;
/// Registers every layout binding on module `m`: tensor layouts first,
/// then matrix layouts.
void bind_layout(py::module &m) {
    bind_tensor_layout(m);   // see tensor.h
    bind_matrix_layout(m);   // see matrix.h
}

View File

@ -0,0 +1,87 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Matrix layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/layout/matrix.h"
namespace py = pybind11;
/// Registers the matrix layouts from cutlass/layout/matrix.h on module `m`.
/// Each bound class exposes a static `packed(extent)` helper and a `stride()`
/// method that returns element 0 of the layout's stride.
void bind_matrix_layout(py::module &m) {
    using RowMajor = cutlass::layout::RowMajor;
    using ColumnMajor = cutlass::layout::ColumnMajor;
    using RowInterleaved32 = cutlass::layout::RowMajorInterleaved<32>;
    using ColumnInterleaved32 = cutlass::layout::ColumnMajorInterleaved<32>;

    // Row-major.
    py::class_<RowMajor> row_major(m, "RowMajor", R"pbdoc(
Mapping function for row-major matrices.
)pbdoc");
    row_major.def_static(
        "packed", &RowMajor::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
    row_major.def(
        "stride",
        [](const RowMajor &self) { return self.stride().at(0); },
        R"pbdoc(Returns the stride of the layout)pbdoc");

    // Column-major.
    py::class_<ColumnMajor> column_major(m, "ColumnMajor", R"pbdoc(
Mapping function for column-major matrices.
)pbdoc");
    column_major.def_static(
        "packed", &ColumnMajor::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
    column_major.def(
        "stride",
        [](const ColumnMajor &self) { return self.stride().at(0); },
        R"pbdoc(Returns the stride of the layout)pbdoc");

    // Row-major, interleaved by 32.
    py::class_<RowInterleaved32> row_interleaved(m, "RowMajorInterleaved32",
        R"pbdoc(Mapping function for interleaved matrices. Matrix is structured
as row-major arrangement of fixed-size columns 32)pbdoc");
    row_interleaved.def_static(
        "packed", &RowInterleaved32::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
    row_interleaved.def(
        "stride",
        [](const RowInterleaved32 &self) { return self.stride().at(0); },
        R"pbdoc(Returns the stride of the layout)pbdoc");

    // Column-major, interleaved by 32.
    py::class_<ColumnInterleaved32> column_interleaved(m, "ColumnMajorInterleaved32",
        R"pbdoc(Mapping function for interleaved matrices. Matrix is structured
as column-major arrangement of fixed-size rows 32)pbdoc");
    column_interleaved.def_static(
        "packed", &ColumnInterleaved32::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
    column_interleaved.def(
        "stride",
        [](const ColumnInterleaved32 &self) { return self.stride().at(0); },
        R"pbdoc(Returns the stride of the layout)pbdoc");
}

View File

@ -0,0 +1,74 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Tensor layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
/// Registers the tensor layouts from cutlass/include/cutlass/layout/tensor.h
/// on module `m`. Each bound class exposes a static `packed(extent)` helper
/// and a `stride()` accessor.
void bind_tensor_layout(py::module &m) {
    using NHWC = cutlass::layout::TensorNHWC;
    using NC32HW32 = cutlass::layout::TensorNCxHWx<32>;
    using C32RSK32 = cutlass::layout::TensorCxRSKx<32>;

    // 4-D NHWC tensors.
    py::class_<NHWC> nhwc(m, "TensorNHWC",
        R"pbdoc(Mapping function for 4-D NHWC tensors)pbdoc");
    nhwc.def_static(
        "packed", &NHWC::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed NHWC tensor)pbdoc");
    nhwc.def(
        "stride", py::overload_cast<>(&NHWC::stride),
        R"pbdoc(Returns the stride of the layout)pbdoc");

    // 4-D NC/xHWx tensors with x = 32.
    py::class_<NC32HW32> nc32hw32(m, "TensorNC32HW32",
        R"pbdoc(Mapping function for 4-D NC/32HW32 tensors)pbdoc");
    nc32hw32.def_static(
        "packed", &NC32HW32::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
    nc32hw32.def(
        "stride", py::overload_cast<>(&NC32HW32::stride),
        R"pbdoc(Returns the stride of the layout)pbdoc");

    // 4-D CxRSKx tensors with x = 32.
    py::class_<C32RSK32> c32rsk32(m, "TensorC32RSK32",
        R"pbdoc(Mapping function for 4-D C32RSK32 tensors)pbdoc");
    c32rsk32.def_static(
        "packed", &C32RSK32::packed, py::arg("extent"),
        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
    c32rsk32.def(
        "stride", py::overload_cast<>(&C32RSK32::stride),
        R"pbdoc(Returns the stride of the layout)pbdoc");
}

View File

@ -0,0 +1,152 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind threadblock swizzling to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"
#include <boost/core/demangle.hpp>
#include <cuda_runtime.h>
namespace py = pybind11;
/// Binds swizzle type `T` as Python class `name` on module `m`.
///
/// Exposes a default constructor, three `get_tiled_shape` overloads (GEMM,
/// Conv2d and Conv3d problem sizes), `get_grid_shape`, and `tag`, which
/// returns the demangled C++ type name of `T`.
template<typename T>
void bind_identity_swizzle(py::module & m, std::string name) {
    using GemmCoord = cutlass::gemm::GemmCoord;
    using ConvOperator = cutlass::conv::Operator;
    using Conv2dProblem = cutlass::conv::Conv2dProblemSize;
    using Conv3dProblem = cutlass::conv::Conv3dProblemSize;

    py::class_<T> cls(m, name.c_str(),
        R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc");
    cls.def(py::init<>());

    // GEMM overload.
    cls.def("get_tiled_shape",
        py::overload_cast<GemmCoord, GemmCoord, int>(&T::get_tiled_shape, py::const_),
        py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
        R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: gemm(M, N, K)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc");

    // Conv2d overload.
    cls.def("get_tiled_shape",
        py::overload_cast<ConvOperator, const Conv2dProblem&, GemmCoord, int>(
            &T::get_tiled_shape, py::const_),
        py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
        R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`)
)pbdoc");

    // Conv3d overload.
    cls.def("get_tiled_shape",
        py::overload_cast<ConvOperator, const Conv3dProblem&, GemmCoord, int>(
            &T::get_tiled_shape, py::const_),
        py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
        R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NZPQK, NDHWC, KTRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`)
)pbdoc");

    // TODO: the returned dim3 is not usable in python
    cls.def("get_grid_shape", &T::get_grid_shape,
        py::arg("tiled_shape"),
        R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc");

    // The instance is only needed for dispatch; the tag depends on T alone.
    cls.def("tag",
        [](const T &) { return boost::core::demangle(typeid(T).name()); },
        R"pbdoc(Returns the c++ name of the swizzling for code emittion)pbdoc");
}
/// Binds swizzle type `T` as Python class `name` with class docstring `doc`.
///
/// Exposes a default constructor, the GEMM `get_tiled_shape` overload,
/// `get_grid_shape`, and `tag`, which returns the demangled C++ type name
/// of `T`.
template<typename T>
void bind_swizzle(py::module & m, std::string name, std::string doc) {
    using GemmCoord = cutlass::gemm::GemmCoord;

    py::class_<T> cls(m, name.c_str(), doc.c_str());
    cls.def(py::init<>());

    cls.def("get_tiled_shape",
        py::overload_cast<GemmCoord, GemmCoord, int>(&T::get_tiled_shape, py::const_),
        py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
        R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: gemm(M, N, K)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc");

    cls.def("get_grid_shape", &T::get_grid_shape,
        py::arg("tiled_shape"),
        R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc");

    // The instance is only needed for dispatch; the tag depends on T alone.
    cls.def("tag",
        [](const T &) { return boost::core::demangle(typeid(T).name()); },
        R"pbdoc(Returns the c++ name of the swizzling for code emittion)pbdoc");
}
/// Binds strided-dgrad swizzle type `T` as Python class `name` on module `m`.
///
/// Exposes a default constructor, the Conv2d `get_tiled_shape` overload, a
/// `get_grid_shape` that builds a dim3 directly from the (m, n, k) of the
/// tiled shape, and `tag`, which returns the demangled C++ type name of `T`.
template<typename T>
void bind_dgrad_swizzle(py::module & m, std::string name) {
    using GemmCoord = cutlass::gemm::GemmCoord;
    using ConvOperator = cutlass::conv::Operator;
    using Conv2dProblem = cutlass::conv::Conv2dProblemSize;

    py::class_<T> cls(m, name.c_str(),
        R"pbdoc(Threadblock swizzling function for strided dgrad convolution)pbdoc");
    cls.def(py::init<>());

    cls.def("get_tiled_shape",
        py::overload_cast<ConvOperator, const Conv2dProblem&, GemmCoord, int>(
            &T::get_tiled_shape, py::const_),
        py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
        R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`)
)pbdoc");

    // The grid is computed here rather than by calling T::get_grid_shape.
    cls.def("get_grid_shape",
        [](const T &, GemmCoord tiled_shape) {
            return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
        },
        py::arg("tiled_shape"),
        R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc");

    // The instance is only needed for dispatch; the tag depends on T alone.
    cls.def("tag",
        [](const T &) { return boost::core::demangle(typeid(T).name()); },
        R"pbdoc(Returns the c++ name of the swizzling for code emittion)pbdoc");
}
/// Registers the CUDA `dim3` type and every supported threadblock swizzle
/// (GEMM identity/horizontal/batched and strided-dgrad variants) on module `m`.
void bind_threadblock_swizzle(py::module &m) {
    namespace gemm_tb = cutlass::gemm::threadblock;
    namespace conv_tb = cutlass::conv::threadblock;

    // dim3: constructible from three ints, with read/write x, y, z fields.
    py::class_<dim3> dim3_cls(m, "dim3",
        R"pbdoc(A int3 type xyz contains three integers)pbdoc");
    dim3_cls.def(py::init<int, int, int>(),
        py::arg("x"), py::arg("y"), py::arg("z"));
    dim3_cls.def_readwrite("x", &dim3::x, R"pbdoc(get value x)pbdoc");
    dim3_cls.def_readwrite("y", &dim3::y, R"pbdoc(get value y)pbdoc");
    dim3_cls.def_readwrite("z", &dim3::z, R"pbdoc(get value z)pbdoc");

    // GEMM identity swizzles.
    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<1>>(m, "IdentitySwizzle1");
    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<2>>(m, "IdentitySwizzle2");
    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<4>>(m, "IdentitySwizzle4");
    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<8>>(m, "IdentitySwizzle8");

    // Remaining GEMM swizzles.
    bind_swizzle<gemm_tb::GemmHorizontalThreadblockSwizzle>(
        m, "HorizontalSwizzle",
        R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc");
    bind_swizzle<gemm_tb::GemmBatchedIdentityThreadblockSwizzle>(
        m, "BatchedIdentitySwizzle",
        R"pbdoc(Threadblock swizzling function for batched GEMMs)pbdoc");

    // Strided dgrad convolution swizzles.
    bind_dgrad_swizzle<conv_tb::StridedDgradIdentityThreadblockSwizzle<1>>(m, "StridedDgradIdentitySwizzle1");
    bind_dgrad_swizzle<conv_tb::StridedDgradIdentityThreadblockSwizzle<4>>(m, "StridedDgradIdentitySwizzle4");
    bind_dgrad_swizzle<conv_tb::StridedDgradHorizontalThreadblockSwizzle>(m, "StridedDgradHorizontalSwizzle");
}

View File

@ -0,0 +1,72 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Tensor Coord to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/tensor_coord.h"
namespace py = pybind11;
/// Registers the coordinate types from cutlass/include/cutlass/tensor_coord.h
/// on module `m`: Tensor4DCoord, Coord<3> (as "Tensor3DCoord") and MatrixCoord.
void bind_tensor_coord(py::module &m) {
    using Coord4D = cutlass::Tensor4DCoord;
    using Coord3D = cutlass::Coord<3>;
    using Matrix = cutlass::MatrixCoord;

    // Canonical 4D coordinate; only construction from (n, h, w, c) is exposed.
    py::class_<Coord4D> coord4d(m, "Tensor4DCoord",
        R"pbdoc(Defines a canonical 4D coordinate used by tensor operations)pbdoc");
    coord4d.def(py::init<int, int, int, int>(),
        py::arg("n"), py::arg("h"), py::arg("w"), py::arg("c"),
        R"pbdoc(Helper to construct from N, H, W, and C)pbdoc");

    // 3D coordinate; only element access through `at` is exposed.
    py::class_<Coord3D> coord3d(m, "Tensor3DCoord",
        R"pbdoc(Defines a canonical 3D coordinate used by tensor operations)pbdoc");
    coord3d.def("at", py::overload_cast<int>(&Coord3D::at),
        py::arg("dim"),
        R"pbdoc(Gets the index of a given Coord element)pbdoc");

    // Matrix size / coordinate.
    py::class_<Matrix> matrix(m, "MatrixCoord",
        R"pbdoc(MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes
expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord.)pbdoc");
    matrix.def(py::init<int, int>(),
        py::arg("row"), py::arg("column"),
        R"pbdoc(Helper to construct from a row and column)pbdoc");
    matrix.def("row", py::overload_cast<>(&Matrix::row),
        R"pbdoc(Returns the row of the coordinate)pbdoc");
    matrix.def("column", py::overload_cast<>(&Matrix::column),
        R"pbdoc(Returns the column of the coordinate)pbdoc");
}

View File

@ -0,0 +1,102 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind TensorRef and View to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/tensor_ref.h"
#include "cutlass/tensor_view.h"
#include "types.h"
/// Binds cutlass::TensorRef<T, L> and cutlass::TensorView<T, L> on module `m`
/// as "TensorRef" + name and "TensorView" + name, and registers a
/// module-level `get_tensor_ref` function.
///
/// Template parameters:
///   T  - element type the reference points to
///   L  - layout type
///   TF - type of the `data` argument of `get_tensor_ref`; the argument is
///        never read (presumably it only lets pybind11 choose the overload
///        for the element type - confirm against the Python callers)
///
/// Pointers cross the Python boundary as 64-bit integer addresses.
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
    using Ref = cutlass::TensorRef<T, L>;
    using View = cutlass::TensorView<T, L>;

    py::class_<Ref>(m, ("TensorRef" + name).c_str())
        // Factory-style constructor: TensorRef(address, layout).
        // This replaces the placement-new `.def("__init__", ...)` form,
        // which pybind11 has deprecated since 2.2. The Python-visible
        // signature is unchanged.
        .def(py::init([](int64_t address, const L& layout_) {
            return Ref(reinterpret_cast<T*>(address), layout_);
        }))
        // Returns the raw data pointer as an integer address.
        .def("data", [](Ref& tensor_ref) {
            return reinterpret_cast<int64_t>(tensor_ref.data());
        })
        .def("layout", py::overload_cast<>(&Ref::layout));

    // Builds a TensorRef from an address and a layout; `data` is unused.
    m.def("get_tensor_ref", [](int64_t address, TF data, const L& layout_) {
        return Ref(reinterpret_cast<T*>(address), layout_);
    });

    // TensorView is constructible from a TensorRef plus an extent.
    py::class_<View>(m, ("TensorView" + name).c_str())
        .def(py::init<const Ref&, const typename L::TensorCoord &>());
}
/// Instantiates bind_tensor_ref_view for every (element, layout) pair exposed to
/// Python. The string passed with each call is the suffix of the generated
/// "TensorRef..." / "TensorView..." class names.
void bind_tensor_refs_and_views(py::module &m) {
    namespace lyt = cutlass::layout;

    // f32
    bind_tensor_ref_view<float, lyt::RowMajor, cutlass::float32>(m, "F32RowMajor");
    bind_tensor_ref_view<float, lyt::ColumnMajor, cutlass::float32>(m, "F32ColumnMajor");
    bind_tensor_ref_view<float, lyt::TensorNHWC, cutlass::float32>(m, "F32NHWC");

    // f64
    bind_tensor_ref_view<double, lyt::RowMajor, cutlass::float64>(m, "F64RowMajor");
    bind_tensor_ref_view<double, lyt::ColumnMajor, cutlass::float64>(m, "F64ColumnMajor");
    bind_tensor_ref_view<double, lyt::TensorNHWC, cutlass::float64>(m, "F64NHWC");

    // f16
    bind_tensor_ref_view<cutlass::half_t, lyt::RowMajor, cutlass::half_t>(m, "F16RowMajor");
    bind_tensor_ref_view<cutlass::half_t, lyt::ColumnMajor, cutlass::half_t>(m, "F16ColumnMajor");
    bind_tensor_ref_view<cutlass::half_t, lyt::TensorNHWC, cutlass::half_t>(m, "F16NHWC");

    // bf16
    bind_tensor_ref_view<cutlass::bfloat16_t, lyt::RowMajor, cutlass::bfloat16_t>(m, "BF16RowMajor");
    bind_tensor_ref_view<cutlass::bfloat16_t, lyt::ColumnMajor, cutlass::bfloat16_t>(m, "BF16ColumnMajor");
    bind_tensor_ref_view<cutlass::bfloat16_t, lyt::TensorNHWC, cutlass::bfloat16_t>(m, "BF16NHWC");

    // s8 (interleaved, canonical and convolution layouts)
    bind_tensor_ref_view<int8_t, lyt::RowMajorInterleaved<32>, cutlass::int8>(m, "S8RowMajorInterleaved32");
    bind_tensor_ref_view<int8_t, lyt::ColumnMajorInterleaved<32>, cutlass::int8>(m, "S8ColumnMajorInterleaved32");
    bind_tensor_ref_view<int8_t, lyt::RowMajor, cutlass::int8>(m, "S8RowMajor");
    bind_tensor_ref_view<int8_t, lyt::ColumnMajor, cutlass::int8>(m, "S8ColumnMajor");
    bind_tensor_ref_view<int8_t, lyt::TensorNHWC, cutlass::int8>(m, "S8NHWC");
    bind_tensor_ref_view<int8_t, lyt::TensorNCxHWx<32>, cutlass::int8>(m, "S8NC32HW32");
    bind_tensor_ref_view<int8_t, lyt::TensorCxRSKx<32>, cutlass::int8>(m, "S8C32RSK32");

    // s32
    bind_tensor_ref_view<int32_t, lyt::RowMajor, cutlass::int32>(m, "S32RowMajor");
    bind_tensor_ref_view<int32_t, lyt::ColumnMajor, cutlass::int32>(m, "S32ColumnMajor");
    bind_tensor_ref_view<int32_t, lyt::TensorNHWC, cutlass::int32>(m, "S32NHWC");
}

View File

@ -0,0 +1,146 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind CUTLASS types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/half.h"
namespace py = pybind11;
namespace cutlass {

/// 8-bit signed integer wrapper.
/// Both constructors convert with a plain int8_t(x) cast; no range check is done.
struct alignas(1) int8 {
    int8_t storage;

    explicit int8(int x) : storage(int8_t(x)) {}
    explicit int8(float x) : storage(int8_t(x)) {}

    /// Returns the stored value.
    int8_t c_value() const { return storage; }
};

/// 32-bit signed integer wrapper.
struct alignas(4) int32 {
    int storage;

    explicit int32(int x) : storage(x) {}
    /// Converts with an int(x) cast (fractional part discarded).
    explicit int32(float x) : storage(int(x)) {}

    /// Returns the stored value.
    int c_value() const { return storage; }
};

/// IEEE single-precision floating-point wrapper.
struct alignas(4) float32 {
    float storage;

    explicit float32(float x) : storage(x) {}
    explicit float32(int x) : storage(float(x)) {}

    /// Returns the stored value.
    float c_value() const { return storage; }
};

/// IEEE double-precision floating-point wrapper.
/// alignas(8): an alignment-specifier may not be weaker than the natural
/// alignment of the type, and the `double` member needs 8 bytes on the usual
/// 64-bit targets (the previous alignas(4) was under-aligned there).
struct alignas(8) float64 {
    double storage;

    explicit float64(float x) : storage(double(x)) {}
    explicit float64(int x) : storage(double(x)) {}

    /// Returns the stored value.
    double c_value() const { return storage; }
};

}  // namespace cutlass
/// Registers the scalar element types on module `m`.
///
/// Every class gets constructors from Python numbers and a read/write
/// `storage` attribute. `value()` returns the wrapped C value for the
/// int8/int32/float32/float64 wrappers; for float16, bfloat16 and tfloat32 it
/// returns the object itself.
void bind_cutlass_types(py::module &m) {
    // s8
    py::class_<cutlass::int8> s8(m, "int8");
    s8.def(py::init<float>());
    s8.def(py::init<int>());
    s8.def_readwrite("storage", &cutlass::int8::storage);
    s8.def("value", &cutlass::int8::c_value);

    // s32
    py::class_<cutlass::int32> s32(m, "int32");
    s32.def(py::init<float>());
    s32.def(py::init<int>());
    s32.def_readwrite("storage", &cutlass::int32::storage);
    s32.def("value", &cutlass::int32::c_value);

    // f16
    py::class_<cutlass::half_t> f16(m, "float16");
    f16.def(py::init<float>());
    f16.def(py::init<double>());
    f16.def(py::init<int>());
    f16.def(py::init<unsigned>());
    f16.def_readwrite("storage", &cutlass::half_t::storage);
    f16.def("value", [](const cutlass::half_t& self) { return self; });

    // bf16
    py::class_<cutlass::bfloat16_t> bf16(m, "bfloat16");
    bf16.def(py::init<float>());
    bf16.def(py::init<int>());
    bf16.def_readwrite("storage", &cutlass::bfloat16_t::storage);
    bf16.def("value", [](const cutlass::bfloat16_t& self) { return self; });

    // f32
    py::class_<cutlass::float32> f32(m, "float32");
    f32.def(py::init<float>());
    f32.def(py::init<int>());
    f32.def_readwrite("storage", &cutlass::float32::storage);
    f32.def("value", &cutlass::float32::c_value);

    // tf32
    py::class_<cutlass::tfloat32_t> tf32(m, "tfloat32");
    tf32.def(py::init<float>());
    tf32.def(py::init<int>());
    tf32.def_readwrite("storage", &cutlass::tfloat32_t::storage);
    tf32.def("value", [](const cutlass::tfloat32_t& self) { return self; });

    // f64
    py::class_<cutlass::float64> f64(m, "float64");
    f64.def(py::init<float>());
    f64.def(py::init<int>());
    f64.def_readwrite("storage", &cutlass::float64::storage);
    f64.def("value", &cutlass::float64::c_value);
}

View File

@ -0,0 +1,32 @@
#include <cutlass/complex.h>
namespace cutlass {

/// Enumeration of element data types.
/// Enumerator order defines the underlying values (kB1 == 0 ... kInvalid == 35);
/// do not reorder.
enum class DataType {
    // 1-bit
    kB1,
    // unsigned integers
    kU2, kU4, kU8, kU16, kU32, kU64,
    // signed integers
    kS2, kS4, kS8, kS16, kS32, kS64,
    // real floating point
    kF16, kBF16, kF32, kTF32, kF64,
    // complex floating point
    kCF16, kCBF16, kCF32, kCTF32, kCF64,
    // complex signed integers
    kCS2, kCS4, kCS8, kCS16, kCS32, kCS64,
    // complex unsigned integers
    kCU2, kCU4, kCU8, kCU16, kCU32, kCU64,
    // sentinel
    kInvalid
};

/// Enumeration of tensor layouts.
/// Enumerator order defines the underlying values (kColumnMajor == 0 ...
/// kTensorC64RSK64 == 15); do not reorder.
enum class LayoutType {
    // canonical matrix layouts
    kColumnMajor,
    kRowMajor,
    // interleaved matrix layouts
    kColumnMajorInterleaved2,
    kRowMajorInterleaved2,
    kColumnMajorInterleaved32,
    kRowMajorInterleaved32,
    kColumnMajorInterleaved64,
    kRowMajorInterleaved64,
    // tensor layouts
    kTensorNHWC,
    kTensorNDHWC,
    kTensorNCHW,
    kTensorNGHWC,
    kTensorNC32HW32,
    kTensorNC64HW64,
    kTensorC32RSK32,
    kTensorC64RSK64
};

// Note: no opcode-class enumeration is defined in this header.

}  // namespace cutlass

View File

@ -0,0 +1,54 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution problems to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/conv2d_problems.h"
#include "cutlass/conv/conv2d_problem_size.h"
namespace py = pybind11;
PYBIND11_MAKE_OPAQUE(std::vector<cutlass::conv::Conv2dProblemSize>);
/// Exposes the Conv2d problem-size containers used by the unit tests:
///   - "Conv2dProblemVector": std::vector<Conv2dProblemSize> bound as an opaque
///     vector, with an extra `size()` method;
///   - "TestbedConv2dProblemSizes": constructible from an int, with a read-only
///     `conv2d_default_sizes` attribute.
void bind_conv_problem_size_test(py::module &m) {
    using ProblemVector = std::vector<cutlass::conv::Conv2dProblemSize>;
    using Testbed = test::conv::device::TestbedConv2dProblemSizes;

    auto problem_vector = py::bind_vector<ProblemVector>(m, "Conv2dProblemVector");
    problem_vector.def("size", &ProblemVector::size);

    // Get Conv2d problem sizes
    py::class_<Testbed> testbed(m, "TestbedConv2dProblemSizes");
    testbed.def(py::init<int>());
    testbed.def_readonly("conv2d_default_sizes", &Testbed::conv2d_default_sizes);
}

View File

@ -0,0 +1,49 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution related types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "conv_problems.h"
#include "host.h"
namespace py = pybind11;
/// Registers all convolution test helpers on `m`: the problem-size bindings go
/// on `m` itself, the host reference bindings on a "host" submodule.
void bind_convolution_test(py::module &m) {
    // Conv problem sizes
    bind_conv_problem_size_test(m);

    // Host-side reference helpers are grouped under m.host
    auto host = m.def_submodule("host");
    bind_conv_host_references(host);
}

View File

@ -0,0 +1,180 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Convolution host test helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/cache_testbed_output.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_compare.h"
namespace py = pybind11;
/// Registers one "conv2d" overload (the host reference Conv2d) and one
/// "CreateCachedConv2dTestKey" overload for the given element/layout types.
///
/// Note the argument order: Conv2d is instantiated with <..., Te, Tacc>, while
/// CreateCachedConv2dTestKey is instantiated with <..., Tacc, Te>.
template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host(py::module &m) {
    namespace ref_host = cutlass::reference::host;
    namespace conv_test = test::conv::device;

    m.def("conv2d",
          &ref_host::Conv2d<Ta, La, Tb, Lb, Tc, Lc, Te, Tacc>);

    m.def("CreateCachedConv2dTestKey",
          &conv_test::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}
/// Same as bind_conv2d_host, except that the "conv2d" overload instantiates the
/// host reference Conv2d with cutlass::NumericConverterClamp<Tc, Te> as its
/// conversion operator.
template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host_sat(py::module &m) {
    namespace ref_host = cutlass::reference::host;
    namespace conv_test = test::conv::device;
    using ClampConverter = cutlass::NumericConverterClamp<Tc, Te>;

    m.def("conv2d",
          &ref_host::Conv2d<Ta, La, Tb, Lb, Tc, Lc, Te, Tacc, ClampConverter>);

    m.def("CreateCachedConv2dTestKey",
          &conv_test::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}
/// Binds the host Conv2d reference with all three tensors in TensorNHWC layout.
template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nhwc(py::module &m) {
    using NHWC = cutlass::layout::TensorNHWC;

    bind_conv2d_host<Ta, NHWC, Tb, NHWC, Tc, NHWC, Tacc, Te>(m);
}
/// Binds the clamping host Conv2d reference for 32-interleaved layouts:
/// A and C use TensorNCxHWx<32>, B uses TensorCxRSKx<32>.
template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nc32hw32(py::module &m) {
    using ActivationLayout = cutlass::layout::TensorNCxHWx<32>;
    using FilterLayout = cutlass::layout::TensorCxRSKx<32>;

    bind_conv2d_host_sat<Ta, ActivationLayout, Tb, FilterLayout, Tc, ActivationLayout, Tacc, Te>(m);
}
/// Registers an "equals" overload that forwards to the two-TensorView overload
/// of cutlass::reference::host::TensorEquals<T, Layout>.
template<typename T, typename Layout>
void bind_tensor_equals(py::module &m) {
    using View = cutlass::TensorView<T, Layout>;

    m.def("equals",
          py::overload_cast<const View&, const View&>(
              &cutlass::reference::host::TensorEquals<T, Layout>));
}
// Registers a "TensorHash" overload for test::conv::device::TensorHash<Element, Layout>
// with keyword arguments "view", "hash" (default: a default-constructed CRC32) and
// "crc" (default: uint32_t(), i.e. 0).
// The macro refers to a variable named `m`, which must be in scope where it is
// expanded. It expands to a braced block, so a trailing ';' at the use site is optional.
#define BIND_TENSOR_HASH(Element, Layout) { \
m.def("TensorHash", &test::conv::device::TensorHash<Element, Layout>, py::arg("view"), py::arg("hash") = test::conv::device::CRC32(), py::arg("crc")=uint32_t()); \
}
/// Registers the host-side convolution test helpers on module `m`:
///   - "conv2d" / "CreateCachedConv2dTestKey" overloads for every supported
///     <Ta, Tb, Tc, Tacc, Te> combination (NHWC and 32-interleaved layouts);
///   - "equals" overloads for tensor comparison;
///   - the result-cache classes CachedTestKey, CachedTestResult,
///     CachedTestResultListing and CRC32;
///   - "TensorHash" overloads.
///
/// Each instantiation is registered exactly once; exact duplicate registrations
/// only add a second identical overload and have been dropped.
///
/// NOTE(review): several instantiations below differ only in Tacc. If Tacc is
/// not part of the bound functions' parameter lists, those overloads have
/// identical Python signatures and only the first registered one can be
/// selected - confirm against the Conv2d / CreateCachedConv2dTestKey signatures.
void bind_conv_host_references(py::module &m) {
    //
    // Conv2d reference on host
    // tools/util/include/cutlass/util/reference/host/convolution.h
    //

    /// double
    bind_conv2d_host_nhwc<double, double, double, double, double>(m);

    /// float
    bind_conv2d_host_nhwc<float, float, float, float, float>(m);

    /// half
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, cutlass::half_t>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, float>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, cutlass::half_t>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, float>(m);
    bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, float>(m);

    /// bfloat16
    bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, cutlass::bfloat16_t>(m);
    bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
    bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, cutlass::bfloat16_t>(m);
    bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);

    /// s8, NHWC
    bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
    bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
    bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
    bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
    bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, float>(m);
    bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, float>(m);

    /// s8, 32-interleaved (saturating conversion)
    bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
    bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
    bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
    bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
    bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, float>(m);
    bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, float>(m);

    //
    // Compare whether two tensors are equal
    //

    /// double
    bind_tensor_equals<double, cutlass::layout::TensorNHWC>(m);
    /// float
    bind_tensor_equals<float, cutlass::layout::TensorNHWC>(m);
    /// half
    bind_tensor_equals<cutlass::half_t, cutlass::layout::TensorNHWC>(m);
    /// bfloat16
    bind_tensor_equals<cutlass::bfloat16_t, cutlass::layout::TensorNHWC>(m);
    /// s32
    bind_tensor_equals<int32_t, cutlass::layout::TensorNHWC>(m);
    bind_tensor_equals<int32_t, cutlass::layout::TensorNCxHWx<32>>(m);
    /// s8
    bind_tensor_equals<int8_t, cutlass::layout::TensorNHWC>(m);
    bind_tensor_equals<int8_t, cutlass::layout::TensorNCxHWx<32>>(m);

    //
    // Cached test results
    //

    py::class_<test::conv::device::CachedTestKey>(m, "CachedTestKey")
        .def(py::init<>())
        .def(py::init<std::string, std::string, std::string, uint32_t, uint32_t, uint32_t>());

    py::class_<test::conv::device::CachedTestResult>(m, "CachedTestResult")
        .def(py::init<>())
        .def(py::init<uint32_t>())
        .def_readwrite("D", &test::conv::device::CachedTestResult::D);

    py::class_<test::conv::device::CachedTestResultListing>(m, "CachedTestResultListing")
        .def(py::init<const std::string &>())
        .def("find", &test::conv::device::CachedTestResultListing::find)
        .def("append", &test::conv::device::CachedTestResultListing::append)
        .def("write", &test::conv::device::CachedTestResultListing::write);

    py::class_<test::conv::device::CRC32>(m, "CRC32")
        .def(py::init<>());

    //
    // Tensor hashing (BIND_TENSOR_HASH uses the local `m`)
    //

    BIND_TENSOR_HASH(double, cutlass::layout::TensorNHWC);
    BIND_TENSOR_HASH(float, cutlass::layout::TensorNHWC);
    BIND_TENSOR_HASH(cutlass::half_t, cutlass::layout::TensorNHWC);
    BIND_TENSOR_HASH(cutlass::bfloat16_t, cutlass::layout::TensorNHWC);
    BIND_TENSOR_HASH(int32_t, cutlass::layout::TensorNHWC);
    BIND_TENSOR_HASH(int8_t, cutlass::layout::TensorNCxHWx<32>);
}

View File

@ -0,0 +1,45 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm test to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "host.h"
namespace py = pybind11;
/// Registers the GEMM test helpers: creates a "host" submodule of `m` and binds
/// the host reference functions onto it.
void bind_gemm_test(py::module &m) {
    auto host = m.def_submodule("host");
    bind_gemm_host_reference(host);
}

View File

@ -0,0 +1,431 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm test host functions to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/cutlass.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/functional.h"
namespace py = pybind11;
/// Registers a "gemm_saturate" overload bound to the host reference
/// compute_gemm, instantiated with
/// cutlass::NumericConverterClamp<ElementC, AccumulatorType> as the output
/// conversion operator.
///
/// Bound parameter list, in order: GemmCoord, ComputeType, TensorRef A,
/// TensorRef B, ComputeType, TensorRef C, TensorRef C, AccumulatorType.
template<
    typename ElementA, typename LayoutA,
    typename ElementB, typename LayoutB,
    typename ElementC, typename LayoutC,
    typename AccumulatorType, typename ComputeType,
    typename InnerProductOp>
void bind_host_gemm_saturate(py::module &m) {
    using RefA = cutlass::TensorRef<ElementA, LayoutA>;
    using RefB = cutlass::TensorRef<ElementB, LayoutB>;
    using RefC = cutlass::TensorRef<ElementC, LayoutC>;
    using ClampConverter = cutlass::NumericConverterClamp<ElementC, AccumulatorType>;

    m.def("gemm_saturate",
          py::overload_cast<
              cutlass::gemm::GemmCoord, ComputeType,
              RefA, RefB,
              ComputeType,
              RefC, RefC,
              AccumulatorType>(
              &cutlass::reference::host::compute_gemm<
                  ElementA, LayoutA,
                  ElementB, LayoutB,
                  ElementC, LayoutC,
                  ComputeType, AccumulatorType,
                  InnerProductOp, ClampConverter>));
}
/// Registers a "gemm" overload bound to the host reference compute_gemm,
/// instantiated with cutlass::NumericConverter<ElementC, AccumulatorType> as
/// the output conversion operator.
///
/// Bound parameter list, in order: GemmCoord, ComputeType, TensorRef A,
/// TensorRef B, ComputeType, TensorRef C, TensorRef C, AccumulatorType.
template<
    typename ElementA, typename LayoutA,
    typename ElementB, typename LayoutB,
    typename ElementC, typename LayoutC,
    typename AccumulatorType, typename ComputeType,
    typename InnerProductOp>
void bind_host_gemm(py::module &m) {
    using RefA = cutlass::TensorRef<ElementA, LayoutA>;
    using RefB = cutlass::TensorRef<ElementB, LayoutB>;
    using RefC = cutlass::TensorRef<ElementC, LayoutC>;
    using Converter = cutlass::NumericConverter<ElementC, AccumulatorType>;

    m.def("gemm",
          py::overload_cast<
              cutlass::gemm::GemmCoord, ComputeType,
              RefA, RefB,
              ComputeType,
              RefC, RefC,
              AccumulatorType>(
              &cutlass::reference::host::compute_gemm<
                  ElementA, LayoutA,
                  ElementB, LayoutB,
                  ElementC, LayoutC,
                  ComputeType, AccumulatorType,
                  InnerProductOp, Converter>));
}
/// Registers "gemm" overloads (via bind_host_gemm) for all eight
/// RowMajor/ColumnMajor layout combinations of A, B and C, using
/// cutlass::multiply_add<AccumulatorType> as the inner-product operator.
///
/// Every instantiation passes <AccumulatorType, ComputeType> in the order
/// bind_host_gemm declares them. The all-RowMajor case previously passed them
/// swapped, which bound a different signature whenever the two types differ.
template<
    typename ElementA, typename ElementB, typename ElementC,
    typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add(py::module &m) {
    using Row = cutlass::layout::RowMajor;
    using Col = cutlass::layout::ColumnMajor;
    using InnerProduct = cutlass::multiply_add<AccumulatorType>;

    // Layout order per call: A, B, C
    bind_host_gemm<ElementA, Row, ElementB, Row, ElementC, Row,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Col, ElementB, Row, ElementC, Row,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Row, ElementB, Col, ElementC, Row,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Row, ElementB, Row, ElementC, Col,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Row, ElementB, Col, ElementC, Col,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Col, ElementB, Row, ElementC, Col,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Col, ElementB, Col, ElementC, Row,
                   AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm<ElementA, Col, ElementB, Col, ElementC, Col,
                   AccumulatorType, ComputeType, InnerProduct>(m);
}
/// Registers "gemm_saturate" overloads (via bind_host_gemm_saturate) for all
/// eight RowMajor/ColumnMajor layout combinations of A, B and C, using
/// cutlass::multiply_add<AccumulatorType> as the inner-product operator.
///
/// Every instantiation passes <AccumulatorType, ComputeType> in the order
/// bind_host_gemm_saturate declares them. The all-RowMajor case previously
/// passed them swapped, which bound a different signature whenever the two
/// types differ.
template<
    typename ElementA, typename ElementB, typename ElementC,
    typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate(py::module &m) {
    using Row = cutlass::layout::RowMajor;
    using Col = cutlass::layout::ColumnMajor;
    using InnerProduct = cutlass::multiply_add<AccumulatorType>;

    // Layout order per call: A, B, C
    bind_host_gemm_saturate<ElementA, Row, ElementB, Row, ElementC, Row,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Col, ElementB, Row, ElementC, Row,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Row, ElementB, Col, ElementC, Row,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Row, ElementB, Row, ElementC, Col,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Row, ElementB, Col, ElementC, Col,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Col, ElementB, Row, ElementC, Col,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Col, ElementB, Col, ElementC, Row,
                            AccumulatorType, ComputeType, InnerProduct>(m);
    bind_host_gemm_saturate<ElementA, Col, ElementB, Col, ElementC, Col,
                            AccumulatorType, ComputeType, InnerProduct>(m);
}
/// Instantiates `bind_host_gemm` on module `m` once for each of the eight
/// RowMajorInterleaved<32>/ColumnMajorInterleaved<32> layout combinations of
/// the A, B and C operands, always using
/// `cutlass::multiply_add<AccumulatorType>` as the inner product.
///
/// NOTE(review): the first (Row, Row, Row) instantiation passes
/// <ComputeType, AccumulatorType> whereas the other seven pass
/// <AccumulatorType, ComputeType>. The order is reproduced unchanged here;
/// confirm against the parameter list of `bind_host_gemm`.
template<
    typename ElementA, typename ElementB, typename ElementC,
    typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_interleaved(py::module &m) {
    using RowI32 = cutlass::layout::RowMajorInterleaved<32>;
    using ColI32 = cutlass::layout::ColumnMajorInterleaved<32>;
    using MultiplyAdd = cutlass::multiply_add<AccumulatorType>;

    // Layout order below is (A, B, C).
    bind_host_gemm<
        ElementA, RowI32, ElementB, RowI32, ElementC, RowI32,
        ComputeType, AccumulatorType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, ColI32, ElementB, RowI32, ElementC, RowI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, RowI32, ElementB, ColI32, ElementC, RowI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, RowI32, ElementB, RowI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, RowI32, ElementB, ColI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, ColI32, ElementB, RowI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, ColI32, ElementB, ColI32, ElementC, RowI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm<
        ElementA, ColI32, ElementB, ColI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
}
/// Instantiates `bind_host_gemm_saturate` on module `m` once for each of the
/// eight RowMajorInterleaved<32>/ColumnMajorInterleaved<32> layout
/// combinations of the A, B and C operands, always using
/// `cutlass::multiply_add<AccumulatorType>` as the inner product.
///
/// NOTE(review): the first (Row, Row, Row) instantiation passes
/// <ComputeType, AccumulatorType> whereas the other seven pass
/// <AccumulatorType, ComputeType>. The order is reproduced unchanged here;
/// confirm against the parameter list of `bind_host_gemm_saturate`.
template<
    typename ElementA, typename ElementB, typename ElementC,
    typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate_interleaved(py::module &m) {
    using RowI32 = cutlass::layout::RowMajorInterleaved<32>;
    using ColI32 = cutlass::layout::ColumnMajorInterleaved<32>;
    using MultiplyAdd = cutlass::multiply_add<AccumulatorType>;

    // Layout order below is (A, B, C).
    bind_host_gemm_saturate<
        ElementA, RowI32, ElementB, RowI32, ElementC, RowI32,
        ComputeType, AccumulatorType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, ColI32, ElementB, RowI32, ElementC, RowI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, RowI32, ElementB, ColI32, ElementC, RowI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, RowI32, ElementB, RowI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, RowI32, ElementB, ColI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, ColI32, ElementB, RowI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, ColI32, ElementB, ColI32, ElementC, RowI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
    bind_host_gemm_saturate<
        ElementA, ColI32, ElementB, ColI32, ElementC, ColI32,
        AccumulatorType, ComputeType, MultiplyAdd>(m);
}
/// Registers an "equals" overload that forwards to
/// cutlass::reference::host::TensorEquals<Element, Layout> for two
/// cutlass::TensorView<Element, Layout> arguments.
/// The expansion refers to a variable named `m`, so it must be used where
/// such a module object is in scope. The body is a braced block, so a
/// trailing `;` at the call site is an empty statement.
#define BIND_TENSOR_EQUAL(Element, Layout) { \
m.def("equals", py::overload_cast< \
const cutlass::TensorView<Element, Layout>&, const cutlass::TensorView<Element, Layout>&>( \
&cutlass::reference::host::TensorEquals<Element, Layout>)); \
}
/// Registers the host-reference GEMM bindings and the `equals` tensor
/// comparison overloads on module `m` for every supported element-type
/// combination.
///
/// Template argument order for the `bind_host_gemm_multiply_add*` helpers is
/// <ElementA, ElementB, ElementC, AccumulatorType, ComputeType>.
///
/// NOTE(review): the original listed each of the `<..., int32_t, float>`
/// instantiations twice, verbatim. The repeated registrations were exact
/// duplicates and have been removed; if the second copy of each pair was
/// meant to be a different type combination, add it back explicitly.
void bind_gemm_host_reference(py::module &m) {

    /// double
    bind_host_gemm_multiply_add<double, double, double, double, double>(m);

    /// float
    bind_host_gemm_multiply_add<float, float, float, float, float>(m);

    /// half_t
    bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
    bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
    bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
    bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, float, float>(m);

    /// bfloat16
    bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
    bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);

    /// s8
    bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, float>(m);
    bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, float>(m);

    /// s8, interleaved layouts
    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);

    /// s8, saturating
    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, float>(m);
    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, float>(m);

    /// s8, saturating, interleaved layouts
    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);

    // Tensor equality overloads ("equals").
    // float
    BIND_TENSOR_EQUAL(float, cutlass::layout::RowMajor);
    BIND_TENSOR_EQUAL(float, cutlass::layout::ColumnMajor);

    // double
    BIND_TENSOR_EQUAL(double, cutlass::layout::RowMajor);
    BIND_TENSOR_EQUAL(double, cutlass::layout::ColumnMajor);

    // half_t
    BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::RowMajor);
    BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::ColumnMajor);

    // bfloat16
    BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::RowMajor);
    BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::ColumnMajor);

    // int32_t
    BIND_TENSOR_EQUAL(int32_t, cutlass::layout::RowMajor);
    BIND_TENSOR_EQUAL(int32_t, cutlass::layout::ColumnMajor);

    // int8_t
    BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajor);
    BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajor);
    BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajorInterleaved<32>);
    BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajorInterleaved<32>);
}

View File

@ -0,0 +1,31 @@
from pycutlass.type import *
from pycutlass.tensor_ref import *
from pycutlass.operation import *
from pycutlass.epilogue import *
from pycutlass.compiler import ArtifactManager
from pycutlass.memory_manager import *
from pycutlass.arguments import *
from pycutlass.library import *
from pycutlass.c_types import *
from pycutlass.gemm_operation import *
from pycutlass.conv2d_operation import *
from pycutlass.compiler import *
from pycutlass.utils import *
from pycutlass.frontend import *
from pycutlass.reduction_operation import *
from pycutlass.compiler import *
# module-wide variables
# `this` is a handle to the current module object, so that shared state can be
# stored as attributes on the module itself.
import sys
this = sys.modules[__name__]
# artifact manager
# An ArtifactManager instance is created when this module is imported and
# stored on the module as `compiler`.
this.compiler = ArtifactManager()
def get_memory_pool(init_pool_size=0, max_pool_size=2**34):
    """
    Create a new PoolMemoryManager, publish it as the module-wide
    ``memory_pool`` attribute and return it.

    :param init_pool_size: forwarded to PoolMemoryManager as ``init_pool_size``
    :param max_pool_size: forwarded to PoolMemoryManager as ``max_pool_size``
    :return: the memory pool stored on this module
    """
    pool_kwargs = {
        "init_pool_size": init_pool_size,
        "max_pool_size": max_pool_size,
    }
    this.memory_pool = PoolMemoryManager(**pool_kwargs)
    return this.memory_pool

View File

@ -0,0 +1,104 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from .frontend import CupyFrontend
from typeguard import typechecked
from pycutlass.frontend import *
from typing import Union
import numpy as np
from cuda import cuda
try:
import torch
torch_available = True
except ImportError:
torch_available = False
from cuda import cudart
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
# @typechecked
class ArgumentBase:
    """
    Base class for operation arguments

    Resolves the four operand tensors A, B, C and D to device pointers
    (``ptr_A`` .. ``ptr_D``). The frontend is selected from the type of ``A``
    only; B, C and D are passed through the same frontend.
    """

    def __init__(self,
                 A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 **kwargs) -> None:
        """
        :param A: operand A (numpy array, torch tensor, cupy array or CUdeviceptr)
        :param B: operand B, same kind as ``A``
        :param C: operand C, same kind as ``A``
        :param D: operand D, same kind as ``A``
        :param kwargs: accepted but not used by this base class

        :raises TypeError: if ``A`` is none of the supported kinds
        """
        # preprocessing input tensors
        if isinstance(A, np.ndarray):
            # Remember the host array so that `sync` can copy the result back.
            self.host_D = D
            # The buffer objects are kept as attributes alongside the raw
            # pointers taken from them.
            self.buffer_A = NumpyFrontend.argument(A, False)
            self.buffer_B = NumpyFrontend.argument(B, False)
            self.buffer_C = NumpyFrontend.argument(C, False)
            self.buffer_D = NumpyFrontend.argument(D, True)
            self.ptr_A = self.buffer_A.ptr
            self.ptr_B = self.buffer_B.ptr
            self.ptr_C = self.buffer_C.ptr
            self.ptr_D = self.buffer_D.ptr
        elif torch_available and isinstance(A, torch.Tensor):
            self.ptr_A = TorchFrontend.argument(A)
            self.ptr_B = TorchFrontend.argument(B)
            self.ptr_C = TorchFrontend.argument(C)
            self.ptr_D = TorchFrontend.argument(D)
        elif isinstance(A, cuda.CUdeviceptr):
            # Raw device pointers are used as-is.
            self.ptr_A = A
            self.ptr_B = B
            self.ptr_C = C
            self.ptr_D = D
        elif cupy_available and isinstance(A, cp.ndarray):
            self.ptr_A = CupyFrontend.argument(A)
            self.ptr_B = CupyFrontend.argument(B)
            self.ptr_C = CupyFrontend.argument(C)
            self.ptr_D = CupyFrontend.argument(D)
        else:
            # The message lists every input kind handled above.
            raise TypeError(
                "Unsupported Frontend. Only support numpy, torch, cupy and "
                "cuda.CUdeviceptr")

    def sync(self, stream_sync=True):
        """
        Optionally synchronize the device, then copy D back to the host
        array when the arguments were created from numpy arrays.

        :param stream_sync: when true, call ``cudaDeviceSynchronize`` first

        :raises RuntimeError: if synchronization or the device-to-host copy
            reports an error
        """
        if stream_sync:
            err, = cudart.cudaDeviceSynchronize()
            # The runtime API reports cudaError_t, not the driver's CUresult,
            # so compare against the runtime success code.
            if err != cudart.cudaError_t.cudaSuccess:
                raise RuntimeError("CUDA Error %s" % str(err))

        # `host_D` only exists on the numpy path (see __init__).
        if hasattr(self, "host_D"):
            err, = cuda.cuMemcpyDtoH(
                self.host_D, self.ptr_D, self.host_D.size * self.host_D.itemsize)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

View File

@ -0,0 +1,252 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import ctypes
from pycutlass.library import *
# 12B
class GemmCoord_(ctypes.Structure):
    """
    ctypes mirror of a GEMM coordinate: three C ints ``m``, ``n`` and ``k``.
    """
    _fields_ = [(axis, ctypes.c_int) for axis in ("m", "n", "k")]

    def __init__(self, gemm_coord) -> None:
        """
        :param gemm_coord: object exposing ``m()``, ``n()`` and ``k()``
            accessor methods; each is called once, in that order
        """
        self.m = gemm_coord.m()
        self.n = gemm_coord.n()
        self.k = gemm_coord.k()
class MatrixCoord_(ctypes.Structure):
    """
    ctypes mirror of a matrix coordinate: two C ints ``row`` and ``column``.
    """
    _fields_ = [(axis, ctypes.c_int) for axis in ("row", "column")]
# Maps a cutlass element type to the ctypes type used to lay out the epilogue
# scalars (see get_epilogue_output_op). Types not listed here raise KeyError
# on lookup.
# NOTE(review): float16 is stored as a 16-bit unsigned integer - presumably
# the raw half-precision bit pattern, since ctypes has no half type; confirm
# how callers encode the value.
dtype2ctype = {
    cutlass.float16: ctypes.c_uint16,
    cutlass.float32: ctypes.c_float,
    cutlass.float64: ctypes.c_double,
    cutlass.int32: ctypes.c_int32
}
def get_epilogue_output_op(element_compute_):
    """
    Build the ctypes structure type holding the epilogue parameters.

    The structure has scalar ``alpha`` and ``beta`` fields of the ctypes type
    mapped from ``element_compute_`` via ``dtype2ctype``, followed by the
    ``alpha_ptr`` and ``beta_ptr`` void pointers.

    :param element_compute_: cutlass element type; must be a key of
        ``dtype2ctype`` (KeyError otherwise)
    :return: a new ctypes.Structure subclass
    """
    scalar_ctype = dtype2ctype[element_compute_]
    scalar_fields = [(name, scalar_ctype) for name in ("alpha", "beta")]
    pointer_fields = [(name, ctypes.c_void_p)
                      for name in ("alpha_ptr", "beta_ptr")]

    class _EpilogueOutputOpParams(ctypes.Structure):
        _fields_ = scalar_fields + pointer_fields

    return _EpilogueOutputOpParams
def get_gemm_arguments(element_compute_):
    """
    Build the ctypes structure type describing the GEMM kernel arguments.

    :param element_compute_: cutlass element type of the epilogue scalars
    :return: tuple ``(arguments structure type, epilogue structure type)``
    """
    epilogue_type = get_epilogue_output_op(element_compute_)

    # The field order defines the memory layout, so the groups below are
    # concatenated in exactly the original order.
    header_fields = [
        ("mode", ctypes.c_int),
        ("problem_size", GemmCoord_),
        ("batch_count", ctypes.c_int),
        ("epilogue", epilogue_type),
    ]
    operand_fields = [
        (name, ctypes.c_void_p)
        for name in ("ptr_A", "ptr_B", "ptr_C", "ptr_D")
    ]
    stride_fields = [
        (name, ctypes.c_longlong)
        for name in (
            "batch_stride_A", "batch_stride_B",
            "batch_stride_C", "batch_stride_D",
            "stride_a", "stride_b", "stride_c", "stride_d",
            "lda", "ldb", "ldc", "ldd",
        )
    ]
    # NOTE(review): "ptr_gether_B_indices" looks like a typo for "gather";
    # the spelling is kept because the field name is part of the interface.
    index_fields = [
        (name, ctypes.c_void_p)
        for name in (
            "ptr_gather_A_indices",
            "ptr_gether_B_indices",
            "ptr_scatter_D_indices",
        )
    ]

    class _GemmArguments(ctypes.Structure):
        _fields_ = header_fields + operand_fields + stride_fields + index_fields

    return _GemmArguments, epilogue_type
###########################################################################################
# GEMM Grouped
###########################################################################################
# include/cutlass/gemm/kernel/gemm_grouped.h
def get_gemm_grouped_arguments(element_compute_):
    """
    Build the ctypes structure type describing the grouped GEMM arguments.

    :param element_compute_: cutlass element type of the epilogue scalars
    :return: tuple ``(arguments structure type, epilogue structure type)``
    """
    epilogue_type = get_epilogue_output_op(element_compute_)

    # The field order defines the memory layout; keep the concatenation order.
    # Operand pointers and leading dimensions are all void pointers here.
    array_fields = [
        (name, ctypes.c_void_p)
        for name in (
            "ptr_A", "ptr_B", "ptr_C", "ptr_D",
            "lda", "ldb", "ldc", "ldd",
            "host_problem_sizes",
        )
    ]

    class _GEMMGroupedArguments(ctypes.Structure):
        _fields_ = [
            ("problem_sizes", ctypes.c_void_p),
            ("problem_count", ctypes.c_int),
            ("threadblock_count", ctypes.c_int),
            ("output_op", epilogue_type),
        ] + array_fields

    return _GEMMGroupedArguments, epilogue_type
############################################################################################
# Convolution2D
############################################################################################
# We use the arguments as the interface
# include/cutlass/conv/conv2d_problem_size.h
# 72B (18 x 4-byte int fields)
class Conv2DProblemSize(ctypes.Structure):
    """
    ctypes mirror of the 2-D convolution problem size: eighteen C ints.

    ``mode`` uses kCrossCorrelation: 0, kConvolution: 1.
    """
    _fields_ = [
        (dimension, ctypes.c_int)
        for dimension in (
            "N", "H", "W", "C",
            "P", "Q",
            "K", "R", "S",
            "pad_h", "pad_w",
            "stride_h", "stride_w",
            "dilation_h", "dilation_w",
            "mode",  # kCrossCorrelation: 0, kConvolution: 1
            "split_k_slices",
            "groups",
        )
    ]

    def __init__(self, problem_size) -> None:
        """
        :param problem_size: object exposing one attribute per field name;
            the values are read as plain attributes, not called
        """
        for field in self._fields_:
            attribute = field[0]
            setattr(self, attribute, getattr(problem_size, attribute))
# include/cutlass/layout/tensor.h
# 12B
class Layout4D(ctypes.Structure):
    """
    ctypes mirror of a layout holding three C int strides.
    """
    _fields_ = [
        ("stride", ctypes.c_int * 3)
    ]

    def __init__(self, tensor_ref):
        """
        :param tensor_ref: object whose ``stride()`` method returns an object
            with an ``at(index)`` accessor; indices 0, 1 and 2 are read
        """
        strides = tensor_ref.stride()
        self.stride = tuple(strides.at(axis) for axis in range(3))
# TODO: Tensor 5-D takes ("stride", ctypes.c_int * 4)
# include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
# TensorRef is basically cutlass::TensorRef<Element, Layout>;
# include/cutlass/tensor_ref.h
# 24B
class TensorRef_(ctypes.Structure):
    """
    ctypes mirror of a tensor reference: a data pointer followed by a Layout4D.
    """
    _fields_ = [
        ("ptr", ctypes.c_void_p),
        ("layout", Layout4D)
    ]

    def __init__(self, tensor_ref):
        """
        :param tensor_ref: object exposing ``data()`` (the pointer value) and
            ``layout()`` (the argument passed on to Layout4D)
        """
        self.ptr = tensor_ref.data()
        self.layout = Layout4D(tensor_ref.layout())
class TensorRef2D_(ctypes.Structure):
    """
    ctypes mirror of a 2-D tensor reference: a data pointer and one C int stride.
    """
    # Dict insertion order gives the field order: ptr first, then stride.
    _fields_ = list({
        "ptr": ctypes.c_void_p,
        "stride": ctypes.c_int,
    }.items())
# include/cutlass/conv/kernel/implicit_gemm_convolution.h
# split_k_mode: kNone: 0, kSerial: 1, kParallel: 2, kParallelSerial: 3, kInvalid: 4
def get_conv2d_arguments(element_compute_):
    """
    Build the ctypes structure type describing the Conv2d kernel arguments.

    Field order is ``problem_size``, ``ref_A`` .. ``ref_D``, ``output_op``,
    ``split_k_mode``.

    :param element_compute_: cutlass element type of the epilogue scalars
    :return: tuple ``(arguments structure type, epilogue structure type)``
    """
    epilogue_type = get_epilogue_output_op(element_compute_)

    # One tensor reference per operand, in A, B, C, D order.
    tensor_ref_fields = [("ref_" + operand, TensorRef_) for operand in "ABCD"]

    class _Conv2dArguments(ctypes.Structure):
        _fields_ = (
            [("problem_size", Conv2DProblemSize)]
            + tensor_ref_fields
            + [
                ("output_op", epilogue_type),
                ("split_k_mode", ctypes.c_int),
            ]
        )

    return _Conv2dArguments, epilogue_type
############################################################################################
# Reduction
############################################################################################
def get_reduction_params(element_compute_):
    """
    Build the ctypes structure type describing the reduction parameters.

    :param element_compute_: cutlass element type of the epilogue scalars
    :return: tuple ``(parameters structure type, epilogue structure type)``
    """
    epilogue_type = get_epilogue_output_op(element_compute_)

    # The three 2-D tensor references, in layout order.
    tensor_fields = [
        (name, TensorRef2D_)
        for name in ("workspace", "destination", "source")
    ]

    class _ReductionParams(ctypes.Structure):
        _fields_ = (
            [
                ("problem_size", MatrixCoord_),
                ("partitions", ctypes.c_int),
                ("partition_stride", ctypes.c_longlong),
            ]
            + tensor_fields
            + [("output_op", epilogue_type)]
        )

    return _ReductionParams, epilogue_type

View File

@ -0,0 +1,366 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from pycutlass import *
from pycutlass.library import SubstituteTemplate
import cutlass
from cuda import cuda
from cuda import nvrtc
import tempfile
import os
import ctypes
#
import json
import sqlite3
# Template for one `#include "..."` line (with trailing newline); the
# `${include}` placeholder is filled in through SubstituteTemplate.
IncludeTemplate = r'''#include "${include}"
'''
#
class CompilationOptions:
    '''
    Compilation options.

    Collects the compiler flags, include paths and target architectures and
    renders them as a list of byte strings.
    '''

    #
    def __init__(self, architectures = None, include_paths = None):
        '''
        :param architectures: list of compute capabilities as integers
            (e.g. ``[80]``); defaults to ``[80]``
        :param include_paths: list of include directories; defaults to ``[]``
        '''
        # `None` sentinels give every instance its own default lists instead
        # of sharing one mutable default object between all instances.
        self.includes = []
        self.include_paths = [] if include_paths is None else include_paths
        self.flags = ['-std=c++11', '-default-device']
        self.architectures = [80] if architectures is None else architectures

    #
    def get(self):
        '''
        Render the options as a list of encoded byte strings.

        The order is: all flags, then one ``--include-path=`` entry per
        include path, then a single ``-arch=`` entry listing every
        architecture as ``sm_<n>``, comma separated.

        :return: list of ``bytes``
        '''
        options = [flag.encode() for flag in self.flags]
        options.extend(
            ('--include-path=%s' % incl).encode()
            for incl in self.include_paths)

        arch_list = "-arch=" + ",".join(
            "sm_%d" % arch for arch in self.architectures)
        options.append(arch_list.encode())

        return options
def convertToBinaryData(filename):
    """
    Read a file in binary mode and return its entire content.

    :param filename: path of the file to read
    :return: the file content as ``bytes``
    """
    with open(filename, 'rb') as stream:
        return stream.read()
def CDLLBin(host_binary):
    """
    Load a shared library from its in-memory binary image.

    The image is written to a temporary ``host_func*.so`` file in the current
    directory, loaded with ``ctypes.CDLL``, and the file is removed again
    before returning.

    NOTE(review): as before, this relies on the platform keeping an already
    loaded library usable after its file has been deleted.

    :param host_binary: content of the shared library as ``bytes``
    :return: the loaded ``ctypes.CDLL`` handle
    :raises OSError: if the image cannot be loaded as a shared library
    """
    # `dir` places the file in the current directory without overwriting the
    # process-wide `tempfile.tempdir` setting.
    with tempfile.NamedTemporaryFile(
            prefix='host_func', suffix='.so', delete=True, dir="./") as temp_so:
        temp_so.write(host_binary)
        # Make sure the bytes are on disk before the loader opens the file.
        temp_so.flush()
        host_lib = ctypes.CDLL(temp_so.name)
    return host_lib
class ArtifactManager:
    """
    Artifact manager

    Compiles operations (device code through NVRTC, host helper code through
    g++) and caches the results in two in-memory caches as well as in the
    SQLite database ``./compiled_cache.db``.
    """

    def __init__(self) -> None:
        """
        Make sure the ``compiled_operations`` table exists and create the
        in-memory device and host caches.
        """
        # Preparing the on-disk cache stays best effort: a database error
        # here must not prevent construction. `IF NOT EXISTS` replaces the
        # former catch-all that hid the "table already exists" error together
        # with every other exception.
        try:
            connection = sqlite3.connect("./compiled_cache.db")
            try:
                sqlite_create_table_query = """CREATE TABLE IF NOT EXISTS compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
                connection.execute(sqlite_create_table_query)
                connection.commit()
            finally:
                connection.close()
        except sqlite3.Error:
            pass

        self.compiled_cache_device = cutlass.CompileCache()
        self.compiled_cache_host = cutlass.CompileCache()

    def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
        """
        Store one compiled operation in the on-disk cache.

        An existing row with the same ``op_key`` is left untouched
        (``INSERT OR IGNORE``).

        :param op_key: unique key of the operation
        :param cubin: device binary image
        :param hostfile: path of the compiled host shared library
        :param op_name: name of the operation
        :param op_attrs: JSON-serializable attribute list
        """
        sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""

        hostbin = convertToBinaryData(hostfile)
        data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))

        connection = sqlite3.connect("./compiled_cache.db")
        try:
            cursor = connection.cursor()
            cursor.execute(sqlite_insert_blob_query, data_tuple)
            connection.commit()
            cursor.close()
        finally:
            # Always release the database handle, also on failure.
            connection.close()

    def load_operation(self, op_key):
        """
        Load a compiled operation from the on-disk cache into the in-memory
        caches.

        :param op_key: unique key of the operation
        :return: ``True`` if a record was found and loaded, else ``False``
        :raises RuntimeError: if the CUDA module or kernel cannot be loaded
        """
        sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""

        connection = sqlite3.connect("./compiled_cache.db")
        try:
            cursor = connection.cursor()
            cursor.execute(sqlite_fetch_blob_query, (op_key, ))
            record = cursor.fetchall()
            cursor.close()
        finally:
            connection.close()

        if not record:
            return False

        for row in record:
            key, cubin_image, host_binary, operation_name, op_attr = row
            op_attr = json.loads(op_attr)

            # device kernel
            err, module = cuda.cuModuleLoadData(cubin_image)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError('Cuda Error: {}'.format(err))
            err, kernel = cuda.cuModuleGetFunction(
                module, operation_name.encode())
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError('Cuda Error: {}'.format(err))
            self.compiled_cache_device.insert(key, kernel)

            # host functions
            compiled_host_fns = {}
            host_lib = CDLLBin(host_binary)

            # The first attribute is the parameter size stored by add_module.
            func = getattr(host_lib, operation_name + '_get_params')
            func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
            compiled_host_fns['get_args'] = func

            func = getattr(host_lib, operation_name + '_shared_memory_size')
            compiled_host_fns['shared_memory_capacity'] = func()

            # The string attributes are the suffixes of the extra functions.
            for attr in op_attr:
                if isinstance(attr, str):
                    func = getattr(host_lib, operation_name + '_' + attr)
                    compiled_host_fns[attr] = func

            self.compiled_cache_host.insert(key, compiled_host_fns)
        return True

    def emit_compile_(self, operation_list, compilation_options):
        """
        Compile a list of kernels and store them into database

        The device source is compiled with NVRTC, the host source with g++
        into a temporary shared library in the current directory.

        :param operation_list: runtime modules to compile; each provides
            ``emitter``, ``emit()``, ``name()``, ``KernelTemplate`` and
            ``HostTemplate``
        :param compilation_options: object whose ``get()`` returns the
            encoded compiler options
        :return: tuple ``(cubin image, loaded host library, temporary file
            object of the host library)``
        :raises RuntimeError: on NVRTC or host compilation failure
        """
        # Imported locally so this class brings its own dependency with it.
        import subprocess

        source_buffer_device = ""
        source_buffer_host = ""

        # 1. include
        includes = []
        for operation in operation_list:
            for incl in operation.emitter.includes:
                if incl not in includes:
                    includes.append(incl)

        includes_host = [
            "builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
        for incl in includes:
            source_buffer_device += SubstituteTemplate(IncludeTemplate, {'include': incl})

        # Headers under "/device/" are left out of the host source.
        for incl in includes_host:
            if "/device/" not in incl:
                source_buffer_host += SubstituteTemplate(IncludeTemplate, { 'include': incl} )

        # 2. Operations
        for operation in operation_list:
            source_buffer_device += operation.emit()
            source_buffer_host += operation.emit()
            values = {
                'operation_name': operation.name(),
                'operation_suffix': operation.emitter.operation_suffix
            }
            source_buffer_device += SubstituteTemplate(operation.KernelTemplate, values)
            source_buffer_host += SubstituteTemplate(operation.HostTemplate, values)

        # 3. compile
        err, program = nvrtc.nvrtcCreateProgram(
            str.encode(source_buffer_device),
            bytes(str.encode("module.cu")),
            0, [], [])

        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError('NVRTC Error: {}'.format(err))

        # Compile program
        options = compilation_options.get()

        err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:

            error_string = 'NVRTC Error: {}\n'.format(err)

            # Get log from compilation
            err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

            log = b' ' * logSize
            err, = nvrtc.nvrtcGetProgramLog(program, log)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

            raise RuntimeError(error_string + log.decode() + source_buffer_device)

        # Get data from compilation
        err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError('NVRTC Error: {}'.format(err))

        cubin_image = b' ' * dataSize
        err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError('NVRTC Error: {}'.format(err))

        # compile the host code
        # Translate the NVRTC options for g++: drop the NVRTC-only flags and
        # every `-arch=` entry (not just sm_80), and turn include paths into
        # `-I` options.
        host_flags = []
        for opt in compilation_options.get():
            opt = opt.decode("utf-8")
            if opt in ('-default-device', '-std=c++11') or opt.startswith('-arch='):
                continue
            if '--include-path=' in opt:
                host_flags.append(opt.replace('--include-path=', '-I'))
            else:
                host_flags.append(opt)

        # `dir` keeps the library in the current directory without changing
        # the process-wide `tempfile.tempdir` setting.
        temp = tempfile.NamedTemporaryFile(
            prefix='host_func', suffix='.so', delete=True, dir="./")

        # The source goes to g++ on stdin and the arguments are passed as a
        # list, so neither quotes in the generated source nor spaces in
        # include paths can break the command.
        command = ["g++", "-x", "c++", "-fpermissive", "-w", "-fPIC"]
        command += host_flags
        command += ["-", "-shared", "-o", temp.name]
        result = subprocess.run(
            command, input=source_buffer_host.encode(), stderr=subprocess.PIPE)
        if result.returncode != 0:
            raise RuntimeError(
                'Host compilation error:\n'
                + result.stderr.decode(errors='replace'))

        host_lib = ctypes.CDLL(temp.name)

        return cubin_image, host_lib, temp

    def add_module(self, operations, compile_options=None):
        """
        Insert a new compiled device module

        Operations found in the in-memory or on-disk cache are wired up
        directly; the remaining ones are compiled together, registered in
        both caches and written to the database.

        :param operations: operations to add; each provides ``rt_module``
            and ``procedural_name()``
        :param compile_options: compilation options; when ``None`` they are
            derived from the ``CUTLASS_PATH`` and ``CUDA_INSTALL_PATH``
            environment variables and the operations' tile descriptions
        :raises RuntimeError: if the compiled module or a kernel cannot be
            loaded
        """
        if compile_options is None:
            cutlass_path = os.getenv('CUTLASS_PATH')
            assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
            architectures = []
            for operation in operations:
                if hasattr(operation, "tile_description"):
                    cc = operation.tile_description.minimum_compute_capability
                    if cc not in architectures:
                        architectures.append(cc)
            include_paths = [
                cuda_install_path + '/include',
                cutlass_path + '/include',
                cutlass_path + '/tools/util/include',
            ]
            compile_options = CompilationOptions(architectures, include_paths)

        # Cache keys and runtime modules that still need to be compiled.
        pending_keys = []
        pending_modules = []
        for operation in operations:
            # step 1: get kernel string as key
            key = operation.rt_module.emit() + operation.procedural_name()

            # step 2: check if the operation is in cache
            compiled_kernel = self.compiled_cache_device.at(key)

            if compiled_kernel is None:
                hit = self.load_operation(key)
                if hit:
                    compiled_kernel = self.compiled_cache_device.at(key)
                    assert compiled_kernel is not None

            if compiled_kernel is not None:
                operation.rt_module.kernel = compiled_kernel
                compiled_host_fns = self.compiled_cache_host.at(key)
                assert compiled_host_fns is not None
                for fn_name in compiled_host_fns.keys():
                    setattr(operation.rt_module, fn_name, compiled_host_fns[fn_name])
                operation.rt_module.initialize()
            else:
                pending_modules.append(operation.rt_module)
                pending_keys.append(key)

        if not pending_modules:
            return

        cubin_image, host_lib, host_file = self.emit_compile_(
            pending_modules, compile_options)

        err, module = cuda.cuModuleLoadData(cubin_image)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(err))

        compiled_names = []
        compiled_attrs = []
        for rt_module, key in zip(pending_modules, pending_keys):
            name = rt_module.name()

            # get device kernels
            err, kernel = cuda.cuModuleGetFunction(module, name.encode())
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError('Cuda Error: {}'.format(err))
            rt_module.kernel = kernel
            compiled_names.append(name)
            self.compiled_cache_device.insert(key, kernel)

            # get host functions
            compiled_host_fns = {}
            op_attr = []

            # get param size
            param_size = getattr(host_lib, name + '_get_param_size')()

            func = getattr(host_lib, name + '_get_params')
            # NOTE(review): ctypes itself reads `argtypes` (plural); the
            # `argtype` attribute set here is kept as in the original.
            # Confirm whether `argtypes` was intended.
            func.argtype = rt_module.argtype
            func.restype = ctypes.POINTER(ctypes.c_char * param_size)
            setattr(rt_module, 'get_args', func)
            compiled_host_fns['get_args'] = func

            # set shared memory size
            shared_memory_capacity = getattr(
                host_lib, name + '_shared_memory_size')()
            setattr(rt_module, 'shared_memory_capacity', shared_memory_capacity)
            compiled_host_fns['shared_memory_capacity'] = shared_memory_capacity

            # set the maximum dynamic shared size
            rt_module.initialize()

            # get extra functions
            # The parameter size comes first, followed by the suffix of each
            # extra function; load_operation relies on this layout.
            op_attr.append(param_size)

            if hasattr(rt_module, "extra_funcs"):
                for suffix in rt_module.extra_funcs:
                    func = getattr(host_lib, name + '_' + suffix)
                    setattr(rt_module, suffix, func)
                    compiled_host_fns[suffix] = func
                    op_attr.append(suffix)

            compiled_attrs.append(op_attr)
            self.compiled_cache_host.insert(key, compiled_host_fns)

        # Every operation of this batch shares the same device image and
        # host library.
        for key, name, attrs in zip(pending_keys, compiled_names, compiled_attrs):
            self.insert_operation(
                key, cubin_image, host_file.name, name, attrs)
# Module-level ArtifactManager instance, created when this module is imported.
artifact_manager = ArtifactManager()

View File

@ -0,0 +1,430 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from pycutlass import *
from pycutlass.library import SubstituteTemplate
import cutlass
from cuda import cuda
from cuda import nvrtc
import tempfile
import os
import ctypes
#
import json
import sqlite3
# Template for a single '#include "..."' line of generated source code;
# ${include} is replaced with the header path through SubstituteTemplate.
IncludeTemplate = r'''#include "${include}"
'''
#
class CompilationOptions:
    '''
    Compilation options.

    Holds the compiler flags, include search paths and target SM
    architectures of one build and renders them either as a single
    command-line string (:meth:`get_str`) or as a list of byte strings
    (:meth:`get`).

    :param flags: compiler flags, emitted verbatim and in order
    :param architectures: compute capabilities (``80`` means ``sm_80``);
        a fresh ``[80]`` is used when omitted
    :param include_paths: directories emitted as ``--include-path=<dir>``;
        a fresh empty list is used when omitted
    '''

    #
    def __init__(self, flags, architectures=None, include_paths=None):
        self.includes = []
        # ``None`` sentinels instead of list literals in the signature: a
        # literal default is created once and would be shared (and mutated)
        # across every instance that relies on it.
        self.include_paths = [] if include_paths is None else include_paths
        self.flags = flags
        self.architectures = [80] if architectures is None else architectures

    #
    def _option_strings(self):
        ''' Return every option as text: flags, include paths, then -arch. '''
        options = list(self.flags)
        options.extend(
            '--include-path=%s' % incl for incl in self.include_paths)
        options.append(
            "-arch=" + ",".join("sm_%d" % arch for arch in self.architectures))
        return options

    #
    def get_str(self):
        ''' Return the options as one string, each option preceded by a space. '''
        return "".join(" " + option for option in self._option_strings())

    #
    def get(self):
        ''' Return the options as a list of encoded byte strings. '''
        return [bytes(str.encode(option)) for option in self._option_strings()]
def convertToBinaryData(filename):
    """Return the entire content of the file ``filename`` as bytes."""
    with open(filename, 'rb') as stream:
        return stream.read()
def CDLLBin(host_binary):
    """
    Load a shared library from its binary image.

    The image is written to a temporary ``host_func*.so`` file in the
    current working directory and opened with :class:`ctypes.CDLL`.

    :param host_binary: content of the shared object
    :type host_binary: bytes
    :return: the loaded library
    :rtype: ctypes.CDLL
    """
    # Sets the tempfile module's default directory to the working directory.
    tempfile.tempdir = "./"
    shared_object = tempfile.NamedTemporaryFile(
        prefix='host_func', suffix='.so', delete=True)
    so_path = shared_object.name
    with open(so_path, 'wb') as stream:
        stream.write(host_binary)
    return ctypes.CDLL(so_path)
class ArtifactManager:
"""
Artifact manager

Compiles operations into a device image and a host helper library and
caches the results, both in memory (``cutlass.CompileCache``) and in the
SQLite database ``./compiled_cache.db``.
"""
def __init__(self) -> None:
    """
    Prepare the on-disk cache table and the in-memory caches.

    The ``compiled_operations`` table is created in ``./compiled_cache.db``
    when it does not exist yet. The backend defaults to NVRTC.
    """
    try:
        connection = sqlite3.connect("./compiled_cache.db")
        try:
            cursor = connection.cursor()
            # IF NOT EXISTS makes repeated start-ups succeed instead of
            # relying on a swallowed "table already exists" error.
            sqlite_create_table_query = """CREATE TABLE IF NOT EXISTS compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
            cursor.execute(sqlite_create_table_query)
            connection.commit()
            cursor.close()
        finally:
            connection.close()
    except sqlite3.Error:
        # Table creation stays best-effort, as before: a database problem
        # must not prevent the manager from being constructed. Only SQLite
        # errors are ignored; anything else propagates.
        pass
    self.backend = "nvrtc"
    self.default_compile_options = [
        '-std=c++11', '-default-device',
    ]
    self.compiled_cache_device = cutlass.CompileCache()
    self.compiled_cache_host = cutlass.CompileCache()
def nvcc(self):
    """Select the nvcc backend and reset the default compile options for it."""
    self.default_compile_options = ['-std=c++11']
    self.backend = "nvcc"
def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
    """
    Store one compiled operation in ``./compiled_cache.db``.

    An existing row with the same ``op_key`` is left untouched
    (``INSERT OR IGNORE``).

    :param op_key: unique key of the operation
    :param cubin: device binary image
    :param hostfile: path of the host shared library; its content is stored
    :param op_name: name of the operation
    :param op_attrs: operation attributes, stored as JSON text
    """
    sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
    hostbin = convertToBinaryData(hostfile)
    data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
    connection = sqlite3.connect("./compiled_cache.db")
    try:
        cursor = connection.cursor()
        cursor.execute(sqlite_insert_blob_query, data_tuple)
        connection.commit()
        cursor.close()
    finally:
        # Always release the database handle, also when the insert fails.
        connection.close()
def load_operation(self, op_key):
    """
    Load a compiled operation from ``./compiled_cache.db`` into the
    in-memory caches.

    :param op_key: unique key of the operation
    :return: ``True`` when the key was found and loaded, ``False`` otherwise
    :rtype: bool
    :raises RuntimeError: if the stored device image cannot be loaded or
        does not contain the kernel
    """
    sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
    connection = sqlite3.connect("./compiled_cache.db")
    try:
        cursor = connection.cursor()
        cursor.execute(sqlite_fetch_blob_query, (op_key, ))
        record = cursor.fetchall()
        cursor.close()
    finally:
        # The rows are fully fetched; the handle is no longer needed.
        connection.close()
    if not record:
        return False
    for row in record:
        key, cubin_image, host_binary, operation_name, op_attr = row
        # op_attr[0] is the parameter size in bytes; the string entries
        # name additional host functions.
        op_attr = json.loads(op_attr)
        err, module = cuda.cuModuleLoadData(cubin_image)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(err))
        err, kernel = cuda.cuModuleGetFunction(
            module, bytes(str.encode(operation_name)))
        # Do not cache a kernel handle from a failed lookup.
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(err))
        self.compiled_cache_device.insert(key, kernel)

        compiled_host_fns = {}
        host_lib = CDLLBin(host_binary)

        func_name = operation_name + '_get_params'
        func = getattr(host_lib, func_name)
        func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
        compiled_host_fns['get_args'] = func

        func_name = operation_name + '_shared_memory_size'
        func = getattr(host_lib, func_name)
        compiled_host_fns['shared_memory_capacity'] = func()

        for attr in op_attr:
            if isinstance(attr, str):
                func_name = operation_name + '_' + attr
                func = getattr(host_lib, func_name)
                compiled_host_fns[attr] = func

        self.compiled_cache_host.insert(key, compiled_host_fns)
    return True
def emit_compile_(self, operation_list, compilation_options):
    """
    Compile a list of kernels into one device image and one host library.

    The device code is compiled with NVRTC or nvcc depending on
    ``self.backend``; the host helper functions are compiled with g++.
    Nothing is written to the cache database here.

    :param operation_list: runtime modules to compile; each one provides
        ``emit()``, ``name()``, ``emitter``, ``KernelTemplate`` and
        ``HostTemplate``
    :param compilation_options: object providing ``get()`` and ``get_str()``
    :return: ``(cubin_image, host_lib, host_file)`` - the device image, the
        loaded host library and the temporary file holding that library
    :raises RuntimeError: if NVRTC, nvcc or g++ reports a failure
    """
    import subprocess

    source_buffer_device = ""
    source_buffer_host = ""
    # 1. include
    includes = []
    for operation in operation_list:
        for incl in operation.emitter.includes:
            if incl not in includes:
                includes.append(incl)

    includes_host = [
        "builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
    for incl in includes:
        source_buffer_device += SubstituteTemplate(
            IncludeTemplate, {'include': incl})

    for incl in includes_host:
        if "/device/" not in incl:
            source_buffer_host += SubstituteTemplate(
                IncludeTemplate, {'include': incl})

    # 2. Operations
    for operation in operation_list:
        emitted = operation.emit()
        source_buffer_device += emitted
        source_buffer_host += emitted
        values = {
            'operation_name': operation.name(),
            'operation_suffix': operation.emitter.operation_suffix
        }
        source_buffer_device += SubstituteTemplate(
            operation.KernelTemplate, values)
        source_buffer_host += SubstituteTemplate(
            operation.HostTemplate, values)

    if self.backend == "nvrtc":
        # 3. compile
        err, program = nvrtc.nvrtcCreateProgram(
            str.encode(source_buffer_device),
            bytes(str.encode("module.cu")),
            0, [], [])

        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError('NVRTC Error: {}'.format(err))

        # Compile program
        options = compilation_options.get()

        err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            error_string = 'NVRTC Error: {}\n'.format(err)

            # Get log from compilation
            err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

            log = b' ' * logSize
            err, = nvrtc.nvrtcGetProgramLog(program, log)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

            raise RuntimeError(
                error_string + log.decode() + source_buffer_device)

        # Get data from compilation
        err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError('NVRTC Error: {}'.format(err))

        cubin_image = b' ' * dataSize
        err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError('NVRTC Error: {}'.format(err))
    else:  # with nvcc backend
        # emit code
        tempfile.tempdir = "./"
        temp_cu = tempfile.NamedTemporaryFile(
            prefix='kernel', suffix='.cu', delete=True)
        temp_cubin = tempfile.NamedTemporaryFile(
            prefix='kernel', suffix='.cubin', delete=True)
        with open(temp_cu.name, 'w') as file:
            file.write(source_buffer_device)

        # compile with nvcc
        cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
        assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
        cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
        values = {
            "cuda_install_path": cuda_install_path,
            "options": compilation_options.get_str(),
            "srcfile": temp_cu.name,
            "tarfile": temp_cubin.name
        }
        cmd = SubstituteTemplate(cmd_template, values)
        # Report a failed compilation here instead of reading an empty
        # cubin file afterwards.
        status = os.system(cmd)
        if status != 0:
            raise RuntimeError(
                'nvcc Error: command exited with status {}: {}'.format(
                    status, cmd))

        # load the cubin image
        with open(temp_cubin.name, 'rb') as file:
            cubin_image = file.read()

    # compile the host code
    host_cmd = ["g++", "-x", "c++", "-fpermissive", "-w", "-fPIC"]
    device_only_options = ('-default-device', '-std=c++11', '-Xcicc', '-Xllc')
    for opt in compilation_options.get():
        opt = opt.decode("utf-8")
        # Drop options meant for the device compiler only. Every
        # architecture option is dropped, not just '-arch=sm_80'.
        if opt in device_only_options or opt.startswith('-arch='):
            continue
        if '--include-path=' in opt:
            host_cmd.append(opt.replace('--include-path=', '-I'))
        else:
            # Whitespace-separated words stay separate arguments, as they
            # were when the command was a shell string.
            host_cmd.extend(opt.split())

    tempfile.tempdir = "./"
    temp = tempfile.NamedTemporaryFile(
        prefix='host_func', suffix='.so', delete=True)
    host_cmd += ["-", "-shared", "-o", temp.name]

    # Feed the source through stdin directly. Embedding it in a quoted
    # `echo '...'` shell string breaks as soon as the source contains a
    # single quote and leaves backslash handling to the shell's echo.
    result = subprocess.run(
        host_cmd, input=(source_buffer_host + "\n").encode())
    if result.returncode != 0:
        raise RuntimeError(
            'g++ Error: host compilation exited with status {}'.format(
                result.returncode))

    host_lib = ctypes.CDLL(temp.name)

    return cubin_image, host_lib, temp
def add_module(self, operations, compile_options=None):
    """
    Insert a new compiled device module

    Operations found in the in-memory cache or in the cache database are
    reused. All remaining operations are compiled together, registered in
    the in-memory caches and written to the database.

    :param operations: operations to make runnable; each one provides
        ``rt_module`` and ``procedural_name()``
    :param compile_options: compilation options; when ``None`` they are
        built from the ``CUTLASS_PATH`` and ``CUDA_INSTALL_PATH`` environment
        variables and the compute capabilities of ``operations``
    :raises RuntimeError: if the device image cannot be loaded or a kernel
        cannot be found in it
    """
    if compile_options is None:
        cutlass_path = os.getenv('CUTLASS_PATH')
        assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
        cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
        assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
        architectures = []
        for operation in operations:
            if hasattr(operation, "tile_description"):
                cc = operation.tile_description.minimum_compute_capability
                if cc not in architectures:
                    architectures.append(cc)
        include_paths = [
            cuda_install_path + '/include',
            cutlass_path + '/include',
            cutlass_path + '/tools/util/include',
        ]
        compile_options = CompilationOptions(
            self.default_compile_options, architectures, include_paths)

    # runtime modules that still have to be compiled, and their cache keys
    operation_key = []
    operation_list = []
    for operation in operations:
        # step 1: get kernel string as key
        key = operation.rt_module.emit() + operation.procedural_name() + self.backend
        # step 2: check if the operation is in cache (memory first, then
        # the database)
        compiled_kernel = self.compiled_cache_device.at(key)

        if compiled_kernel is None:
            hit = self.load_operation(key)
            if hit:
                compiled_kernel = self.compiled_cache_device.at(key)
                assert compiled_kernel is not None

        if compiled_kernel is not None:
            operation.rt_module.kernel = compiled_kernel
            compiled_host_fns = self.compiled_cache_host.at(key)
            assert compiled_host_fns is not None
            for fn_name in compiled_host_fns.keys():
                setattr(operation.rt_module, fn_name,
                        compiled_host_fns[fn_name])
            operation.rt_module.initialize()
        else:
            operation_list.append(operation.rt_module)
            operation_key.append(key)

    if len(operation_list) > 0:
        cubin_image, host_lib, host_file = self.emit_compile_(
            operation_list, compile_options)

        err, module = cuda.cuModuleLoadData(cubin_image)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(err))

        operation_names = []
        operation_attrs = []
        for operation, key in zip(operation_list, operation_key):
            # get device kernels
            err, operation.kernel = cuda.cuModuleGetFunction(
                module,
                bytes(str.encode(operation.name()))
            )
            # Do not cache or persist a kernel from a failed lookup.
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError('Cuda Error: {}'.format(err))
            operation_names.append(operation.name())
            self.compiled_cache_device.insert(key, operation.kernel)

            # get host functions
            compiled_host_fns = {}
            op_attr = []

            # get param size
            func_name = operation.name() + '_get_param_size'
            func = getattr(host_lib, func_name)
            param_size = func()

            func_name = operation.name() + '_get_params'
            func = getattr(host_lib, func_name)
            # NOTE(review): ctypes names this attribute `argtypes`; as
            # written, `argtype` does not look like it is consulted by
            # ctypes. Left unchanged because enabling argument checking
            # could affect existing callers - confirm.
            func.argtype = operation.argtype
            func.restype = ctypes.POINTER(ctypes.c_char * param_size)
            setattr(operation, 'get_args', func)
            compiled_host_fns['get_args'] = func

            # set shared memory size
            func_name = operation.name() + '_shared_memory_size'
            func = getattr(host_lib, func_name)
            shared_memory_capacity = func()
            setattr(operation, 'shared_memory_capacity',
                    shared_memory_capacity)
            compiled_host_fns['shared_memory_capacity'] = shared_memory_capacity
            # set the maximum dynamic shared size
            operation.initialize()

            # get extra functions
            op_attr.append(param_size)

            if hasattr(operation, "extra_funcs"):
                for suffix in operation.extra_funcs:
                    func_name = operation.name() + '_' + suffix
                    func = getattr(host_lib, func_name)
                    setattr(operation, suffix, func)
                    compiled_host_fns[suffix] = func
                    op_attr.append(suffix)

            operation_attrs.append(op_attr)
            self.compiled_cache_host.insert(key, compiled_host_fns)

        # save the cubin and the host library of every new operation
        for key, name, attrs in zip(
                operation_key, operation_names, operation_attrs):
            self.insert_operation(
                key, cubin_image, host_file.name, name, attrs)

View File

@ -0,0 +1,645 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typeguard import typechecked
from cuda import cuda
from typing import Union
import numpy as np
from typeguard import typechecked
from pycutlass import *
# @typechecked
class Conv2dArguments(ArgumentBase):
"""
Argument wrapper for Conv2d. It encodes problem information and
user-provided tensors into the kernel's argument.
:param operation: the Conv2d operation to take the argument
:type operation: :class:`pycutlass.Conv2dOperation`
:param problem_size: the Conv2d problem size
:type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
:param A: tensor A
:type A: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param B: tensor B
:type B: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param C: tensor C
:type C: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param D: tensor D
:type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param split_k_mode: conv2d split K mode, defaults to
cutlass.conv.SplitKMode.Serial
:type split_k_mode: cutlass.conv.SplitKMode, optional
:param output_op: output operator, optional
:type output_op: :class:`pycutlass.LinearCombinationFunctorArguments`
:param split_k_slices: number of split K slices, optional keyword
argument; when it is omitted, one slice and the serial split K mode
are used regardless of ``split_k_mode``
"""
def __init__(self, operation: 'Conv2dOperation',
             problem_size: 'cutlass.conv.Conv2dProblemSize',
             A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
             B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
             C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
             D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
             split_k_mode: 'cutlass.conv.SplitKMode'
             = cutlass.conv.SplitKMode.Serial, **kwargs) -> None:
    """
    Record the operand descriptions of ``operation``, forward the tensors
    to the base class and build the kernel arguments.

    Recognised keyword arguments are ``output_op`` and ``split_k_slices``;
    all keyword arguments are also passed on to the base class.
    """
    #: convolution kind
    self.conv_kind: cutlass.conv.Operator = operation.conv_kind
    #: operand layouts
    self.layout_A: cutlass.layout = operation.A.layout
    self.layout_B: cutlass.layout = operation.B.layout
    self.layout_C: cutlass.layout = operation.C.layout

    #: operand element types
    self.element_A = operation.A.element
    self.element_B = operation.B.element
    self.element_C = operation.C.element

    # the interleaved output layout needs a reordered tensor B
    if self.layout_C == cutlass.TensorNC32HW32:
        B = self.reorder_tensor_B(B, problem_size)

    super().__init__(A, B, C, D, **kwargs)

    # preprocessing output ops: a user supplied output op is honoured
    # unless the split K mode is parallel
    take_output_op = "output_op" in kwargs and \
        split_k_mode != cutlass.conv.SplitKMode.Parallel
    if take_output_op:
        self.alpha = kwargs["output_op"].alpha
        self.beta = kwargs["output_op"].beta
    else:
        self.alpha = 1.0
        self.beta = 0.0

    self.element_compute = operation.element_epilogue

    # the requested split K mode only applies together with a slice count
    if "split_k_slices" not in kwargs:
        self.split_k_mode = cutlass.conv.SplitKMode.Serial
        self.split_k_slices = 1
    else:
        self.split_k_mode = split_k_mode
        self.split_k_slices = kwargs["split_k_slices"]

    #: problem_size
    self.problem_size: cutlass.conv.Conv2dProblemSize = problem_size
    self.problem_size.split_k_slices = self.split_k_slices

    self.operation = operation

    #
    # initialize the argument
    #
    self.initialize()
# @typechecked
def reorder_tensor_B(self, tensor_B: 'np.ndarray',
                     problem_size: 'cutlass.conv.Conv2dProblemSize'):
    """
    Return a reordered copy of tensor B for the interleaved layout.

    :param tensor_B: input tensor B
    :type tensor_B: numpy.ndarray
    :param problem_size: Conv2d problem size
    :type problem_size: :class:`cutlass.conv.Conv2dProblemSize`

    :return: reordered tensor B
    :rtype: numpy.ndarray
    """
    # destination buffer with the shape and dtype of the input
    result = np.empty_like(tensor_B)
    source_ref = self.get_tensor_ref(
        tensor_B, self.element_B, self.layout_B, problem_size, "b")
    result_ref = self.get_tensor_ref(
        result, self.element_B, self.layout_B, problem_size, "b")
    # the host routine fills `result` through its tensor reference
    cutlass.conv.host.reorder_convK(
        result_ref, source_ref, self.conv_kind, problem_size)
    return result
def get_tensor_ref(
        self, tensor, dtype, tensor_layout, problem_size, operand):
    """
    Build the tensor reference of one operand.

    :param tensor: the tensor (or pointer) to wrap
    :param dtype: element type of the tensor
    :param tensor_layout: layout type; its ``packed`` method is given the
        operand extent
    :param problem_size: Conv2d problem size
    :param operand: ``"a"``, ``"b"``, ``"c"`` or ``"d"``
    :raises ValueError: if ``operand`` is none of the above
    """
    # pick the extent function of the operand, then evaluate it once
    if operand == "a":
        extent_of = cutlass.conv.implicit_gemm_tensor_a_extent
    elif operand == "b":
        extent_of = cutlass.conv.implicit_gemm_tensor_b_extent
    elif operand in ["c", "d"]:
        extent_of = cutlass.conv.implicit_gemm_tensor_c_extent
    else:
        raise ValueError("unknown operand: " + operand)

    tensor_coord = extent_of(self.conv_kind, problem_size)
    packed_layout = tensor_layout.packed(tensor_coord)
    return TensorRef(tensor, dtype, packed_layout).tensor_ref
def get_arguments(self, semaphore):
    """
    Assemble the C argument structure of the kernel in ``self.c_arguments``
    and remember ``semaphore`` in ``self.semaphore``.

    :param semaphore: value stored as ``self.semaphore``
    """
    def operand_ref(pointer, element, layout, operand):
        # tensor reference of one operand for the stored problem size
        return TensorRef_(self.get_tensor_ref(
            pointer, element, layout, self.problem_size, operand))

    ref_A = operand_ref(self.ptr_A, self.element_A, self.layout_A, "a")
    ref_B = operand_ref(self.ptr_B, self.element_B, self.layout_B, "b")
    ref_C = operand_ref(self.ptr_C, self.element_C, self.layout_C, "c")
    # D is described with the element type and layout of C
    ref_D = operand_ref(self.ptr_D, self.element_C, self.layout_C, "d")

    # convert the scalars to the representation of the compute type
    compute_type = self.element_compute
    if compute_type == cutlass.float16:
        alpha = cutlass.float16(self.alpha).storage
        beta = cutlass.float16(self.beta).storage
    elif compute_type == cutlass.int32:
        alpha = int(self.alpha)
        beta = int(self.beta)
    else:
        alpha = self.alpha
        beta = self.beta

    argument_type, epilogue_type = get_conv2d_arguments(
        self.operation.element_epilogue)

    output_op = epilogue_type(alpha, beta, 0, 0)

    self.c_arguments = argument_type(
        Conv2DProblemSize(self.problem_size),
        ref_A, ref_B, ref_C, ref_D, output_op, self.split_k_mode
    )

    self.semaphore = semaphore
def initialize(self):
    """
    Initialize the kernel arguments handling following stuffs
    1. get kernel launch configuration including grid, cta size,
    and dynamic shared memory capacity
    2. allocate and initialize device workspace
    3. get kernel params as bytearray for NVRTC input

    :raises RuntimeError: if the device workspace cannot be cleared
    """
    # get launch configuration
    self.launch_config = self.operation.rt_module.plan(self)

    # allocate and initialize device workspace
    device_workspace_size = \
        self.operation.rt_module.get_device_workspace_size(self)

    if device_workspace_size > 0:
        self.workspace_buffer = device_mem_alloc(device_workspace_size)
        workspace_ptr = self.workspace_buffer.ptr
        # the size is in bytes, cuMemsetD32 counts 32-bit words
        err, = cuda.cuMemsetD32(
            workspace_ptr, 0, device_workspace_size // 4)
        # The workspace may be used as the split K semaphore below, so a
        # failed clear must not go unnoticed.
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(err))
    else:
        workspace_ptr = None

    # get kernel params as bytearray
    semaphore = 0
    if workspace_ptr is not None and \
            self.split_k_mode == cutlass.conv.SplitKMode.Parallel:
        # parallel split K: the kernel writes into the workspace
        self.ptr_D = workspace_ptr
    elif workspace_ptr is not None and \
            self.split_k_mode == cutlass.conv.SplitKMode.Serial:
        # serial split K: the workspace is passed as the semaphore
        semaphore = workspace_ptr

    self.get_arguments(semaphore)

    params_ = self.operation.rt_module.get_args(ctypes.byref(
        self.c_arguments), ctypes.c_void_p(int(self.semaphore)))
    self.host_workspace = bytearray(params_.contents)
    self.device_workspace = None
def sync(self):
    """
    Synchronize the arguments by delegating to the base class.

    The original note reads: if the input tensor is in host, copy it from
    device to host. That logic lives in the base class, not in this method.
    """
    outcome = super().sync()
    return outcome
# @typechecked
class Conv2dRT(ExecutableOperation):
"""
Conv2dRT manages the CUTLASS runtime components

It carries the device and host source templates of a Conv2d operation and
computes its launch configuration and device workspace size.
"""
# CUDA source of the extern "C" __global__ entry point of the operation.
# ${operation_name} and ${operation_suffix} are filled in when the sources
# are generated. The kernel reinterprets the dynamic shared memory as the
# operation's SharedStorage and invokes the operation with `params`.
KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
// Dynamic shared memory base pointer
extern __shared__ int SharedStorageBase[];
// Declare pointer to dynamic shared memory.
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
${operation_name}${operation_suffix} op;
op(params, *shared_storage);
}
'''
# C++ source of the extern "C" host helpers of the operation: the size of
# Params, the size of SharedStorage, and a function that builds Params from
# the arguments and returns a byte copy of it. ${operation_name} and
# ${operation_suffix} are filled in when the sources are generated.
# The temporary Params object is deleted once its bytes are copied; it was
# leaked on every call before.
# NOTE(review): the returned `output` buffer is still owned by the caller and
# nothing visible here frees it - confirm whether that is intended.
HostTemplate = r'''
extern "C" {
// Get the size of params in bytes
int ${operation_name}_get_param_size(){
return sizeof(${operation_name}${operation_suffix}::Params);
}
// Get the size of dynamic shared memory in bytes
int ${operation_name}_shared_memory_size() {
return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
}
// Get the params as byte array
char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Arguments* arguments, int *semaphore=nullptr){
typename ${operation_name}${operation_suffix}::Params* params;
params = new ${operation_name}${operation_suffix}::Params(*arguments, semaphore);
char *bytes = ((char*)(params));
char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
output[i] = bytes[i];
// The bytes are copied into output; release the temporary Params.
delete params;
return output;
}
}
'''
def __init__(self, operation: 'Conv2dOperation'):
    """
    Set up the runtime module of ``operation``.

    :param operation: the Conv2d operation this module belongs to
    :type operation: :class:`Conv2dOperation`
    """
    super().__init__(operation)
    # C types of the host `get_params` arguments: a pointer to the
    # argument structure and a void pointer
    arguments_struct = get_conv2d_arguments(operation.element_epilogue)[0]
    self.argtype = [ctypes.POINTER(arguments_struct), ctypes.c_void_p]
    self.conv_kind = operation.conv_kind

    self.operation: Conv2dOperation = operation

    # the '_type' suffix names the struct emitted for the operation
    self.emitter = EmitConv2dInstance('_type')

    self.threads: int = operation.tile_description.num_threads

    self.swizzle_functor = operation.swizzling_functor
def emit(self):
    """Return what the emitter produces for the wrapped operation."""
    emitter = self.emitter
    return emitter.emit(self.operation)
# @typechecked
def get_device_workspace_size(self, arguments: Conv2dArguments):
    """
    Return the device workspace size in bytes required by ``arguments``.

    Parallel split K reserves ``grid[2]`` copies of tensor C; serial split K
    with more than one slice reserves four bytes per ``grid[0] * grid[1]``
    entry; every other case needs no workspace.

    :param arguments: the Conv2d arguments (``launch_config`` must be set)
    :type arguments: :class:`Conv2dArguments`
    """
    launch_config = arguments.launch_config
    self.conv_kind = self.operation.conv_kind
    split_k_mode = arguments.split_k_mode

    if split_k_mode == cutlass.conv.SplitKMode.Parallel:
        problem_size = arguments.problem_size
        # DataTypeSize is divided by 8 to convert the total to bytes
        return DataTypeSize[self.operation.C.element] \
            * launch_config.grid[2] \
            * cutlass.conv.implicit_gemm_tensor_c_size(
                self.conv_kind, problem_size) // 8

    if split_k_mode == cutlass.conv.SplitKMode.Serial and \
            arguments.split_k_slices > 1:
        return launch_config.grid[0] * launch_config.grid[1] * 4

    return 0
# @typechecked
def plan(self, arguments: Conv2dArguments):
    """
    Compute the launch configuration of the kernel for ``arguments``.

    :param arguments: the Conv2d arguments
    :type arguments: :class:`Conv2dArguments`
    :return: grid shape, block shape ``[threads, 1, 1]`` and the shared
        memory capacity, wrapped in a ``LaunchConfiguration``
    """
    threadblock_shape = self.operation.tile_description.threadblock_shape
    tile_size = cutlass.gemm.GemmCoord(
        threadblock_shape[0], threadblock_shape[1], threadblock_shape[2])

    # tiles per dimension first, then the grid the swizzle maps them to
    tiled_shape = self.swizzle_functor.get_tiled_shape(
        self.conv_kind, arguments.problem_size,
        tile_size, arguments.split_k_slices)
    grid = self.swizzle_functor.get_grid_shape(tiled_shape)

    grid_dims = [grid.x, grid.y, grid.z]
    block_dims = [self.threads, 1, 1]
    return LaunchConfiguration(
        grid_dims, block_dims, self.shared_memory_capacity)
def initialize(self):
    """
    Set the maximum dynamic shared memory size of the kernel to
    ``self.shared_memory_capacity``.

    :raises RuntimeError: if the attribute cannot be set
    """
    attribute = \
        cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
    (err,) = cuda.cuFuncSetAttribute(
        self.kernel, attrib=attribute, value=self.shared_memory_capacity)
    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError('Cuda Error: {}'.format(err))
#
class Conv2dOperation:
"""
CUTLASS Conv2d operation description.
:param conv_kind: convolution operator
:type conv_kind: :class:`cutlass.conv.Operator`
:param iterator_algorithm: Selects among several implementation
variants trading off performance with simplicity
:type iterator_algorithm: :class:`cutlass.conv.IteratorAlgorithm`
:param arch: GPU compute capability (sm_xx)
:type arch: int
:param tile_description: tile description
:type tile_description: :class:`pycutlass.TileDescription`
:param A: tensor A description
:type A: :class:`pycutlass.TensorDescription`
:param B: tensor B description
:type B: :class:`pycutlass.TensorDescription`
:param C: tensor C description
:type C: :class:`pycutlass.TensorDescription`
:param element_epilogue: element type for computation in epilogue
:type element_epilogue: cutlass.int8 | cutlass.int32 | cutlass.float16 |
cutlass.bfloat16 | cutlass.float32 | cutlass.float64
:param stride_support: distinguish among partial specializations that
accelerate certain problems where convolution stride is unit
:type stride_support: :class:`cutlass.conv.StrideSupport`
:param epilogue_functor: convolution epilogue functor
:type epilogue_functor: :class:`EpilogueFunctor`
:param swizzling_functor: threadblock swizzling functor; it is called
without arguments and the result is stored as ``swizzling_functor``
"""
#
def __init__(self,
             conv_kind: cutlass.conv.Operator,
             iterator_algorithm: cutlass.conv.IteratorAlgorithm,
             arch: int, tile_description: TileDescription,
             A: TensorDescription, B: TensorDescription, C: TensorDescription,
             element_epilogue: Union[cutlass.int8, cutlass.int32, cutlass.float16,
                                     cutlass.bfloat16, cutlass.float32, cutlass.float64],
             stride_support, epilogue_functor=EpilogueFunctor.LinearCombination,
             swizzling_functor=cutlass.IdentitySwizzle1):
    """Store the description of the operation and create its runtime module."""
    self.operation_kind: OperationKind = OperationKind.Conv2d

    # target and tiling
    self.arch: int = arch
    self.tile_description: TileDescription = tile_description

    # convolution variant
    self.conv_kind = conv_kind

    # operands
    self.A: TensorDescription = A
    self.B: TensorDescription = B
    self.C: TensorDescription = C

    # epilogue
    self.element_epilogue = element_epilogue
    self.epilogue_functor = epilogue_functor

    self.iterator_algorithm = iterator_algorithm
    self.stride_support = stride_support

    # the functor is passed in as a callable and instantiated here
    self.swizzling_functor = swizzling_functor()

    # created last: the runtime module reads the attributes set above
    self.rt_module: Conv2dRT = Conv2dRT(self)
def run(self, arguments: Conv2dArguments) -> cuda.CUresult:
    """
    Launch the cuda kernel with input arguments

    :param arguments: conv2d arguments
    :type arguments: :class:`pycutlass.Conv2dArguments`
    :return: the result code of the launch
    :raises RuntimeError: if the launch does not report success
    """
    host_workspace = arguments.host_workspace
    device_workspace = arguments.device_workspace
    launch_config = arguments.launch_config

    # launch the kernel
    err = self.rt_module.run(host_workspace, device_workspace, launch_config)

    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError('CUDA Error %s' % str(err))

    return err
#
# Get function name
#
def procedural_name(self):
    ''' Return the procedural name, which is the configuration name. '''
    name = self.configuration_name()
    return name
#
def configuration_name(self):
    ''' Build the name from opcode class, extended name, tile size, layout and alignment. '''
    tile = self.tile_description

    opcode_class_name = OpcodeClassNames[tile.math_instruction.opcode_class]

    # threadblock shape MxN_K followed by the number of stages
    threadblock = "%dx%d_%dx%d" % (
        tile.threadblock_shape[0],
        tile.threadblock_shape[1],
        tile.threadblock_shape[2],
        tile.stages
    )

    # unit-stride specializations carry an extra marker in the name
    configuration_name = (
        "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_align${alignment}"
        if self.stride_support == StrideSupport.Unity else
        "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}"
    )

    substitutions = {
        'opcode_class': opcode_class_name,
        'extended_name': self.extended_name(),
        'threadblock': threadblock,
        'layout': self.layout_name(),
        'alignment': "%d" % self.A.alignment,
    }
    return SubstituteTemplate(configuration_name, substitutions)
#
def extended_name(self):
    ''' Return the core name, extended by the data types that differ from the accumulator type. '''
    accumulator = self.tile_description.math_instruction.element_accumulator

    if self.C.element != accumulator and self.A.element != accumulator:
        # both C and A differ from the accumulator type
        pattern = "${element_c}_${core_name}_${element_a}"
    elif self.C.element == accumulator and self.A.element != accumulator:
        # only A differs from the accumulator type
        pattern = "${core_name}_${element_a}"
    else:
        pattern = "${core_name}"

    substitutions = {
        'element_a': DataTypeNames[self.A.element],
        'element_c': DataTypeNames[self.C.element],
        'core_name': self.core_name()
    }
    return SubstituteTemplate(pattern, substitutions)
#
def layout_name(self):
    ''' Return the short name of the layout of tensor A. '''
    short_layout = ShortLayoutTypeNames[self.A.layout]
    return "%s" % (short_layout)
#
def core_name(self):
    ''' Return the operation kind and iterator algorithm, prefixed with the accumulator type. '''
    math_inst = self.tile_description.math_instruction

    inst_shape = ''
    intermediate_type = ''
    if math_inst.opcode_class == cutlass.OpClass.TensorOp:
        # tensor-op kernels carry the instruction shape in the name
        inst_shape = "%d%d%d" % tuple(math_inst.instruction_shape)
        # the instruction's A type is named only when it differs from
        # both the operand type and the accumulator type
        if math_inst.element_a != self.A.element and \
                math_inst.element_a != self.accumulator_type():
            intermediate_type = DataTypeNames[math_inst.element_a]

    return "%s%s%s%s_%s" % (
        ShortDataTypeNames[self.accumulator_type()],
        inst_shape,
        intermediate_type,
        ConvKindNames[self.conv_kind],
        IteratorAlgorithmNames[self.iterator_algorithm])
#
def is_complex(self):
    ''' Return whether the math operation is one of the complex multiply-add variants. '''
    math_operation = self.tile_description.math_instruction.math_operation
    return math_operation in [
        MathOperation.multiply_add_complex,
        MathOperation.multiply_add_complex_gaussian,
    ]
#
def accumulator_type(self):
    ''' Return the accumulator element type, mapped to its complex type for complex operations. '''
    accum = self.tile_description.math_instruction.element_accumulator
    return get_complex_from_real(accum) if self.is_complex() else accum
###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################
class EmitConv2dInstance:
def __init__(self, operation_suffix=''):
"""
Set up the include list and the C++ template of a Conv2d kernel instance.

:param operation_suffix: text appended to the operation name to form the
name of the emitted struct (``${operation_name}${operation_suffix}``)
"""
self.operation_suffix = operation_suffix
# headers listed for the emitted instance
self.includes = [
"cutlass/cutlass.h",
"cutlass/conv/kernel/default_conv2d_fprop.h",
"cutlass/conv/kernel/default_conv2d_dgrad.h",
"cutlass/conv/kernel/default_conv2d_wgrad.h"
]
# C++ source template: a `${operation_name}_base` alias of the
# DefaultConv2d${conv_kind_name} kernel, and a struct named
# `${operation_name}${operation_suffix}` deriving from it. The ${...}
# placeholders are substituted from outside this method.
self.template = """
// Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
using ${operation_name}_base =
typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
${element_a},
${layout_a},
${element_b},
${layout_b},
${element_c},
${layout_c},
${element_accumulator},
${opcode_class},
${arch},
cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
${epilogue_functor}<
${element_c},
${epilogue_vector_length},
${element_accumulator},
${element_epilogue}
>,
${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
${stages},
${math_operator},
${iterator_algorithm},
${stride_support},
${align_a},
${align_b}
>::Kernel;
struct ${operation_name}${operation_suffix}:
public ${operation_name}_base { };
"""
def emit(self, operation):
warp_shape = [int(operation.tile_description.threadblock_shape[idx] /
operation.tile_description.warp_count[idx]) for idx in range(3)]
epilogue_vector_length = int(min(
operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
values = {
'operation_name': operation.procedural_name(),
'operation_suffix': self.operation_suffix,
'conv_kind': ConvKindTag[operation.conv_kind],
'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
'element_a': DataTypeTag[operation.A.element],
'layout_a': LayoutTag[operation.A.layout],
'element_b': DataTypeTag[operation.B.element],
'layout_b': LayoutTag[operation.B.layout],
'element_c': DataTypeTag[operation.C.element],
'layout_c': LayoutTag[operation.C.layout],
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
'warp_shape_m': str(warp_shape[0]),
'warp_shape_n': str(warp_shape[1]),
'warp_shape_k': str(warp_shape[2]),
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
'epilogue_vector_length': str(epilogue_vector_length),
'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
'swizzling_functor': operation.swizzling_functor.tag(),
'stages': str(operation.tile_description.stages),
'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
'stride_support': StrideSupportTag[operation.stride_support],
'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else
MathOperationTag[operation.tile_description.math_instruction.math_operation],
'align_a': str(operation.A.alignment),
'align_b': str(operation.B.alignment),
}
return SubstituteTemplate(self.template, values)

View File

@ -0,0 +1,138 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import struct
def MaxAlignment(fmt):
    """Return the largest ``struct.calcsize`` of any single character of ``fmt``.

    Each character is sized on its own; the result is never smaller than 1,
    so an empty format yields 1.
    """
    return max([1] + [struct.calcsize(code) for code in fmt])
def AlignedOffset(offset, align):
    """Round ``offset`` up to the next multiple of ``align``.

    An offset that is already a multiple of ``align`` is returned unchanged.
    """
    # (-offset) % align is the distance to the next multiple (0 when aligned).
    padding = (-offset) % align
    return offset + padding
#################################################################################################
#
# Functors
#
#################################################################################################
#
class Functor:
    """Base description of a functor: its declaration/definition text and the
    ``struct`` format of its packed Params structure.

    All four attributes start out as empty strings; subclasses fill them in.
    """

    def __init__(self):
        self.decl = self.definition = self.fmt = self.identifier = ''

    #
    def emit_declaration(self):
        """Return the declaration text (``self.decl``)."""
        declaration = self.decl
        return declaration

    #
    def emit_definition(self):
        """Return the definition text (``self.definition``)."""
        definition = self.definition
        return definition

    #
    def size(self):
        '''
        Number of bytes of the packed Params structure described by ``self.fmt``
        '''
        packed_size = struct.calcsize(self.fmt)
        return packed_size

    #
    def alignment(self):
        """Return ``MaxAlignment`` of the Params format."""
        return MaxAlignment(self.fmt)

    #
    def initialize(self, host_workspace, offset, arguments):
        """Write nothing; return the offset just past this functor's Params.

        ``host_workspace`` and ``arguments`` are unused here.
        """
        end_offset = offset + self.size()
        return end_offset
#################################################################################################
#
class LinearCombinationFunctorArguments:
    """Argument bundle for the linear-combination functor.

    Stores the two scalars ``alpha`` and ``beta``; ``alpha_ptr`` and
    ``beta_ptr`` always start at 0.
    """

    def __init__(self, alpha=1.0, beta=0.0):
        self.alpha, self.beta = alpha, beta
        self.alpha_ptr = self.beta_ptr = 0
#
class LinearCombinationFunctor(Functor):
    """Functor description for ``cutlass::epilogue::thread::LinearCombination``
    instantiated with float element types and a vector length of 1.

    ``size()`` and ``alignment()`` are inherited from ``Functor``; the copies
    that used to live here were identical to the base-class versions.
    """

    def __init__(self):
        super().__init__()
        self.decl = """
cutlass::epilogue::thread::LinearCombination<
float,
1,
float,
float
>"""
        self.identifier = 'linear_combination'
        # Packed Params layout: two floats followed by two pointers.
        self.fmt = "ffPP"

    #
    def initialize(self, host_workspace, offset, arguments):
        """Pack ``arguments`` into ``host_workspace`` and return the end offset.

        :param host_workspace: writable buffer accepted by ``struct.pack_into``
        :param offset: earliest position at which the Params may be written
        :param arguments: object exposing ``alpha``, ``beta``, ``alpha_ptr``
            and ``beta_ptr``
        :return: offset of the first byte after the written Params
        """
        # Params start at the next offset that satisfies the format's alignment.
        offset = AlignedOffset(offset, self.alignment())
        struct.pack_into(
            self.fmt,
            host_workspace, offset,
            arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr)
        return offset + self.size()

View File

@ -0,0 +1,104 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import numpy as np
from cuda import cuda
from pycutlass.memory_manager import *
from typing import TYPE_CHECKING
try:
import torch
torch_available = True
except ImportError:
torch_available = False
if TYPE_CHECKING:
import torch
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
if TYPE_CHECKING:
import cupy as cp
class NumpyFrontend:
    """
    Frontend that prepares numpy arrays as kernel arguments
    """
    @staticmethod
    def argument(np_tensor: 'np.ndarray', is_output: 'bool') -> cuda.CUdeviceptr:
        """Produce the device-side argument for a numpy array

        :param np_tensor: input numpy nd array
        :param is_output: when true, only ``size * itemsize`` bytes are
            requested from ``device_mem_alloc``; otherwise the array is handed
            to ``todevice``
        :return: whatever ``device_mem_alloc`` / ``todevice`` returns
        """
        if not is_output:
            # inputs go through todevice
            return todevice(np_tensor)
        num_bytes = np_tensor.size * np_tensor.itemsize
        return device_mem_alloc(num_bytes)
class TorchFrontend:
    """
    Frontend that prepares torch tensors as kernel arguments
    """
    @staticmethod
    def argument(torch_tensor: 'torch.Tensor') -> cuda.CUdeviceptr:
        """Wrap the tensor's data pointer in a CUDA device pointer

        :param torch_tensor: input torch tensor; when ``is_cuda`` is false the
            result of ``torch_tensor.to("cuda")`` is used instead
        :return: ``cuda.CUdeviceptr`` built from ``data_ptr()``
        """
        # NOTE(review): a tensor created by .to("cuda") is referenced only by
        # this local name while its address is returned -- confirm that the
        # memory stays valid for the caller.
        device_tensor = torch_tensor if torch_tensor.is_cuda else torch_tensor.to("cuda")
        raw_address = device_tensor.data_ptr()
        return cuda.CUdeviceptr(raw_address)
class CupyFrontend:
    """
    Frontend that prepares cupy arrays as kernel arguments
    """
    @staticmethod
    def argument(cupy_ndarray: 'cp.ndarray'):
        """Wrap ``cupy_ndarray.data.ptr`` (converted to int) in ``cuda.CUdeviceptr``

        :param cupy_ndarray: input cupy nd array
        :return: the resulting ``cuda.CUdeviceptr``
        """
        raw_address = int(cupy_ndarray.data.ptr)
        return cuda.CUdeviceptr(raw_address)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,790 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import re
###################################################################################################
import enum
import cutlass
# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
try:
    # Preferred: the standard library's enum.auto, exposed under the name enum_auto.
    from enum import auto as enum_auto
except ImportError:
    # Fallback: a module-level counter; enum_auto() hands out 0, 1, 2, ...
    # across every enum defined in this module.
    __cutlass_library_auto_enum = 0

    def enum_auto() -> int:
        """Return the current counter value and advance the counter by one."""
        global __cutlass_library_auto_enum
        i = __cutlass_library_auto_enum
        __cutlass_library_auto_enum += 1
        return i
###################################################################################################
#
class GeneratorTarget(enum.Enum):
    # Kinds of generator target; only Library is defined.
    Library = enum_auto()
#
# Lower-case name of each generator target.
GeneratorTargetNames = {
    GeneratorTarget.Library: 'library',
}
#
###################################################################################################
#
# One-letter abbreviation per data type. Only these six types have an entry;
# looking up any other type raises KeyError.
ShortDataTypeNames = {
    cutlass.int32: 'i',
    cutlass.float16: 'h',
    cutlass.float32: 's',
    cutlass.float64: 'd',
    cutlass.dtype.cf32: 'c',
    cutlass.dtype.cf64: 'z',
}
#
# Short textual name of each data type, used when composing operation names.
# The 2-bit types (u2, s2, cu2, cs2) are included so that every type listed in
# DataTypeTag also has a name.
DataTypeNames = {
    cutlass.dtype.b1: "b1",
    cutlass.dtype.u2: "u2",
    cutlass.dtype.u4: "u4",
    cutlass.dtype.u8: "u8",
    cutlass.dtype.u16: "u16",
    cutlass.dtype.u32: "u32",
    cutlass.dtype.u64: "u64",
    cutlass.dtype.s2: "s2",
    cutlass.dtype.s4: "s4",
    cutlass.int8: "s8",
    cutlass.dtype.s16: "s16",
    cutlass.int32: "s32",
    cutlass.dtype.s64: "s64",
    cutlass.float16: "f16",
    cutlass.bfloat16: "bf16",
    cutlass.float32: "f32",
    cutlass.tfloat32: "tf32",
    cutlass.float64: "f64",
    cutlass.dtype.cf16: "cf16",
    cutlass.dtype.cbf16: "cbf16",
    cutlass.dtype.cf32: "cf32",
    cutlass.dtype.ctf32: "ctf32",
    cutlass.dtype.cf64: "cf64",
    cutlass.dtype.cu2: "cu2",
    cutlass.dtype.cu4: "cu4",
    cutlass.dtype.cu8: "cu8",
    cutlass.dtype.cu16: "cu16",
    cutlass.dtype.cu32: "cu32",
    cutlass.dtype.cu64: "cu64",
    cutlass.dtype.cs2: "cs2",
    cutlass.dtype.cs4: "cs4",
    cutlass.dtype.cs8: "cs8",
    cutlass.dtype.cs16: "cs16",
    cutlass.dtype.cs32: "cs32",
    cutlass.dtype.cs64: "cs64",
}
# C++ type spelling emitted into generated code for each data type.
DataTypeTag = {
    cutlass.dtype.b1: "cutlass::uint1b_t",
    cutlass.dtype.u2: "cutlass::uint2b_t",
    cutlass.dtype.u4: "cutlass::uint4b_t",
    cutlass.dtype.u8: "uint8_t",
    cutlass.dtype.u16: "uint16_t",
    cutlass.dtype.u32: "uint32_t",
    cutlass.dtype.u64: "uint64_t",
    cutlass.dtype.s2: "cutlass::int2b_t",
    cutlass.dtype.s4: "cutlass::int4b_t",
    cutlass.int8: "int8_t",
    cutlass.dtype.s16: "int16_t",
    cutlass.int32: "int32_t",
    cutlass.dtype.s64: "int64_t",
    cutlass.float16: "cutlass::half_t",
    cutlass.bfloat16: "cutlass::bfloat16_t",
    cutlass.float32: "float",
    cutlass.tfloat32: "cutlass::tfloat32_t",
    cutlass.float64: "double",
    cutlass.dtype.cf16: "cutlass::complex<cutlass::half_t>",
    cutlass.dtype.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
    cutlass.dtype.cf32: "cutlass::complex<float>",
    cutlass.dtype.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
    cutlass.dtype.cf64: "cutlass::complex<double>",
    # NOTE(review): the complex integer entries spell the element type as
    # cutlass::uintN_t / cutlass::intN_t, while the real entries above use
    # plain uintN_t / intN_t -- confirm these names exist in the cutlass namespace.
    cutlass.dtype.cu2: "cutlass::complex<cutlass::uint2b_t>",
    cutlass.dtype.cu4: "cutlass::complex<cutlass::uint4b_t>",
    cutlass.dtype.cu8: "cutlass::complex<cutlass::uint8_t>",
    cutlass.dtype.cu16: "cutlass::complex<cutlass::uint16_t>",
    cutlass.dtype.cu32: "cutlass::complex<cutlass::uint32_t>",
    cutlass.dtype.cu64: "cutlass::complex<cutlass::uint64_t>",
    cutlass.dtype.cs2: "cutlass::complex<cutlass::int2b_t>",
    cutlass.dtype.cs4: "cutlass::complex<cutlass::int4b_t>",
    cutlass.dtype.cs8: "cutlass::complex<cutlass::int8_t>",
    cutlass.dtype.cs16: "cutlass::complex<cutlass::int16_t>",
    cutlass.dtype.cs32: "cutlass::complex<cutlass::int32_t>",
    cutlass.dtype.cs64: "cutlass::complex<cutlass::int64_t>",
}
# Size of one element of each data type, in bits. Every complex entry is twice
# the size of its real counterpart.
DataTypeSize = {
    cutlass.dtype.b1: 1,
    cutlass.dtype.u2: 2,
    cutlass.dtype.u4: 4,
    cutlass.dtype.u8: 8,
    cutlass.dtype.u16: 16,
    cutlass.dtype.u32: 32,
    cutlass.dtype.u64: 64,
    cutlass.dtype.s2: 2,
    cutlass.dtype.s4: 4,
    cutlass.int8: 8,
    cutlass.dtype.s16: 16,
    cutlass.int32: 32,
    cutlass.dtype.s64: 64,
    cutlass.float16: 16,
    cutlass.bfloat16: 16,
    cutlass.float32: 32,
    cutlass.tfloat32: 32,
    cutlass.float64: 64,
    cutlass.dtype.cf16: 32,
    cutlass.dtype.cbf16: 32,
    cutlass.dtype.cf32: 64,
    # tf32 is listed as 32 bits above, so a complex pair takes 64 bits
    # (this entry used to say 32).
    cutlass.dtype.ctf32: 64,
    cutlass.dtype.cf64: 128,
    cutlass.dtype.cu2: 4,
    cutlass.dtype.cu4: 8,
    cutlass.dtype.cu8: 16,
    cutlass.dtype.cu16: 32,
    cutlass.dtype.cu32: 64,
    cutlass.dtype.cu64: 128,
    cutlass.dtype.cs2: 4,
    cutlass.dtype.cs4: 8,
    cutlass.dtype.cs8: 16,
    cutlass.dtype.cs16: 32,
    cutlass.dtype.cs32: 64,
    cutlass.dtype.cs64: 128,
}
###################################################################################################
#
class BlasMode(enum.Enum):
    # Symmetric vs. Hermitian mode selector.
    symmetric = enum_auto()
    hermitian = enum_auto()
#
# C++ enumerator emitted for each BlasMode.
BlasModeTag = {
    BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
    BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
}
#
# C++ enumerator emitted for each complex transform (none / conjugate).
ComplexTransformTag = {
    cutlass.complex_transform.none: 'cutlass::ComplexTransform::kNone',
    cutlass.complex_transform.conj: 'cutlass::ComplexTransform::kConjugate',
}
#
# (real type, complex type) pairs searched by is_complex,
# get_complex_from_real and get_real_from_complex. Only f16, f32 and f64
# are paired; other types are not recognised by those helpers.
RealComplexBijection = [
    (cutlass.float16, cutlass.dtype.cf16),
    (cutlass.float32, cutlass.dtype.cf32),
    (cutlass.float64, cutlass.dtype.cf64),
]
#
def is_complex(data_type):
    """Return True when ``data_type`` is the complex member of a pair in
    RealComplexBijection, False otherwise."""
    return any(data_type == complex_type for _, complex_type in RealComplexBijection)
#
def get_complex_from_real(real_type):
    """Return the complex type paired with ``real_type`` in
    RealComplexBijection, or ``cutlass.dtype.invalid`` when there is no pair."""
    matches = (
        complex_type
        for candidate, complex_type in RealComplexBijection
        if real_type == candidate
    )
    found = next(matches, None)
    if found is None:
        return cutlass.dtype.invalid
    return found
#
def get_real_from_complex(complex_type):
    """Return the real type paired with ``complex_type`` in
    RealComplexBijection, or ``cutlass.dtype.invalid`` when there is no pair."""
    matches = (
        real_type
        for real_type, candidate in RealComplexBijection
        if complex_type == candidate
    )
    found = next(matches, None)
    if found is None:
        return cutlass.dtype.invalid
    return found
#
class ComplexMultiplyOp(enum.Enum):
    # Complex multiplication variants: plain multiply-add or gaussian.
    multiply_add = enum_auto()
    gaussian = enum_auto()
###################################################################################################
#
class MathOperation(enum.Enum):
    # Math operation variants that a math instruction can request.
    multiply_add = enum_auto()
    multiply_add_saturate = enum_auto()
    xor_popc = enum_auto()
    multiply_add_fast_bf16 = enum_auto()
    multiply_add_fast_f16 = enum_auto()
    multiply_add_fast_f32 = enum_auto()
    multiply_add_complex_fast_f32 = enum_auto()
    multiply_add_complex = enum_auto()
    multiply_add_complex_gaussian = enum_auto()
#
# Printable name of each operation; it is exactly the member's own name.
MathOperationNames = {operation: operation.name for operation in MathOperation}
#
# C++ tag type emitted for each operation.
MathOperationTag = dict([
    (MathOperation.multiply_add, 'cutlass::arch::OpMultiplyAdd'),
    (MathOperation.multiply_add_saturate, 'cutlass::arch::OpMultiplyAddSaturate'),
    (MathOperation.xor_popc, 'cutlass::arch::OpXorPopc'),
    (MathOperation.multiply_add_fast_bf16, 'cutlass::arch::OpMultiplyAddFastBF16'),
    (MathOperation.multiply_add_fast_f16, 'cutlass::arch::OpMultiplyAddFastF16'),
    (MathOperation.multiply_add_fast_f32, 'cutlass::arch::OpMultiplyAddFastF32'),
    (MathOperation.multiply_add_complex_fast_f32, 'cutlass::arch::OpMultiplyAddComplexFastF32'),
    (MathOperation.multiply_add_complex, 'cutlass::arch::OpMultiplyAddComplex'),
    (MathOperation.multiply_add_complex_gaussian, 'cutlass::arch::OpMultiplyAddGaussianComplex'),
])
###################################################################################################
#
# C++ layout type emitted for each layout.
LayoutTag = {
    cutlass.ColumnMajor: 'cutlass::layout::ColumnMajor',
    cutlass.RowMajor: 'cutlass::layout::RowMajor',
    cutlass.layout.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
    cutlass.layout.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
    cutlass.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
    cutlass.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
    cutlass.layout.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
    cutlass.layout.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
    cutlass.TensorNHWC: 'cutlass::layout::TensorNHWC',
    cutlass.layout.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
    cutlass.layout.TensorNCHW: 'cutlass::layout::TensorNCHW',
    cutlass.layout.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
    cutlass.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
    cutlass.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
    cutlass.layout.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
    cutlass.layout.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
}
#
# Layout obtained by transposing each matrix layout: column-major and
# row-major variants map to each other; TensorNHWC maps to itself.
TransposedLayout = {
    cutlass.ColumnMajor: cutlass.RowMajor,
    cutlass.RowMajor: cutlass.ColumnMajor,
    cutlass.layout.ColumnMajorInterleaved2: cutlass.layout.RowMajorInterleaved2,
    cutlass.layout.RowMajorInterleaved2: cutlass.layout.ColumnMajorInterleaved2,
    cutlass.ColumnMajorInterleaved32: cutlass.RowMajorInterleaved32,
    cutlass.RowMajorInterleaved32: cutlass.ColumnMajorInterleaved32,
    cutlass.layout.ColumnMajorInterleaved64: cutlass.layout.RowMajorInterleaved64,
    cutlass.layout.RowMajorInterleaved64: cutlass.layout.ColumnMajorInterleaved64,
    cutlass.TensorNHWC: cutlass.TensorNHWC
}
#
# Abbreviation of each layout used in generated names: 'n' prefixes the
# column-major variants, 't' the row-major ones; tensor layouts are spelled
# out in lower case.
ShortLayoutTypeNames = {
    cutlass.ColumnMajor: 'n',
    cutlass.layout.ColumnMajorInterleaved2: 'n2',
    cutlass.ColumnMajorInterleaved32: 'n32',
    cutlass.layout.ColumnMajorInterleaved64: 'n64',
    cutlass.RowMajor: 't',
    cutlass.layout.RowMajorInterleaved2: 't2',
    cutlass.RowMajorInterleaved32: 't32',
    cutlass.layout.RowMajorInterleaved64: 't64',
    cutlass.TensorNHWC: 'nhwc',
    cutlass.layout.TensorNDHWC: 'ndhwc',
    cutlass.layout.TensorNCHW: 'nchw',
    cutlass.layout.TensorNGHWC: 'nghwc',
    cutlass.TensorNC32HW32: 'nc32hw32',
    cutlass.layout.TensorNC64HW64: 'nc64hw64',
    cutlass.TensorC32RSK32: 'c32rsk32',
    cutlass.layout.TensorC64RSK64: 'c64rsk64'
}
#
# One-letter name for a (layout, complex transform) pair: n/t without a
# transform, c/h when the operand is conjugated.
ShortComplexLayoutNames = {
    (cutlass.ColumnMajor, cutlass.complex_transform.none): 'n',
    (cutlass.ColumnMajor, cutlass.complex_transform.conj): 'c',
    (cutlass.RowMajor, cutlass.complex_transform.none): 't',
    (cutlass.RowMajor, cutlass.complex_transform.conj): 'h'
}
###################################################################################################
#
class SideMode(enum.Enum):
    # Left / right side selector.
    Left = enum_auto()
    Right = enum_auto()
#
# C++ enumerator emitted for each SideMode.
SideModeTag = {
    SideMode.Left: 'cutlass::SideMode::kLeft',
    SideMode.Right: 'cutlass::SideMode::kRight'
}
#
# Two-letter abbreviation of each SideMode.
ShortSideModeNames = {
    SideMode.Left: 'ls',
    SideMode.Right: 'rs'
}
###################################################################################################
#
class FillMode(enum.Enum):
    # Lower / upper fill selector.
    Lower = enum_auto()
    Upper = enum_auto()
#
# C++ enumerator emitted for each FillMode.
FillModeTag = {
    FillMode.Lower: 'cutlass::FillMode::kLower',
    FillMode.Upper: 'cutlass::FillMode::kUpper'
}
#
# One-letter abbreviation of each FillMode.
ShortFillModeNames = {
    FillMode.Lower: 'l',
    FillMode.Upper: 'u'
}
###################################################################################################
#
class DiagType(enum.Enum):
    # Non-unit / unit diagonal selector.
    NonUnit = enum_auto()
    Unit = enum_auto()
#
# C++ enumerator emitted for each DiagType.
DiagTypeTag = {
    DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
    DiagType.Unit: 'cutlass::DiagType::kUnit'
}
#
# Two-letter abbreviation of each DiagType.
ShortDiagTypeNames = {
    DiagType.NonUnit: 'nu',
    DiagType.Unit: 'un'
}
###################################################################################################
# Lower-case name of each opcode class, used in generated names.
OpcodeClassNames = {
    cutlass.OpClass.Simt: 'simt',
    cutlass.OpClass.TensorOp: 'tensorop',
    cutlass.OpClass.WmmaTensorOp: 'wmma_tensorop',
    cutlass.OpClass.SparseTensorOp: 'sptensorop'
}
# C++ tag type emitted for each opcode class.
OpcodeClassTag = {
    cutlass.OpClass.Simt: 'cutlass::arch::OpClassSimt',
    cutlass.OpClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
    cutlass.OpClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
    cutlass.OpClass.SparseTensorOp: 'cutlass::arch::OpClassSparseTensorOp'
}
###################################################################################################
#
class OperationKind(enum.Enum):
    # Top-level families of operations.
    Gemm = enum_auto()
    RankK = enum_auto()
    Rank2K = enum_auto()
    Trmm = enum_auto()
    Symm = enum_auto()
    Conv2d = enum_auto()
    Conv3d = enum_auto()
#
# Lower-case name of each operation kind.
OperationKindNames = {
    OperationKind.Gemm: 'gemm', OperationKind.RankK: 'rank_k', OperationKind.Rank2K: 'rank_2k', OperationKind.Trmm: 'trmm', OperationKind.Symm: 'symm', OperationKind.Conv2d: 'conv2d', OperationKind.Conv3d: 'conv3d'
}
#
# GPU architecture name per integer key.
# NOTE(review): keys look like compute-capability numbers (major*10 + minor) -- confirm.
ArchitectureNames = {
    50: 'maxwell',
    60: 'pascal',
    61: 'pascal',
    70: 'volta',
    75: 'turing',
    80: 'ampere',
}
#
# Shared memory budget in KB per integer key (per the inline comments).
# NOTE(review): keys are presumably compute capabilities, as the name suggests -- confirm.
SharedMemPerCC = {
    70: 96, # 96KB of SMEM
    72: 96, # 96KB of SMEM
    75: 64, # 64KB of SMEM
    80: 160, # 164KB of SMEM - 4KB reserved for the driver
    86: 100, # 100KB of SMEM
    87: 160, # 164KB of SMEM - 4KB reserved for the driver
}
###################################################################################################
#
def SubstituteTemplate(template, values):
    """Replace every ``${key}`` placeholder in ``template`` with ``values[key]``.

    Passes over ``values`` are repeated until one makes no change, so a
    substituted value may itself contain placeholders of other keys.

    Keys and values are treated as literal text: a value containing
    backslashes (for example a Windows path) is inserted unchanged, and
    regex metacharacters in a key have no special meaning.

    :param template: text containing ``${key}`` placeholders
    :param values: mapping from placeholder name to replacement string
    :return: the text with all known placeholders substituted
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            placeholder = "${%s}" % key
            # Literal replacement: the earlier regex-based version interpreted
            # backslashes in the value as replacement-template escapes.
            newtext = text.replace(placeholder, value)
            if newtext != text:
                changed = True
            text = newtext
    return text
###################################################################################################
#
class GemmKind(enum.Enum):
    # Flavours of GEMM operation.
    Gemm = enum_auto()
    Sparse = enum_auto()
    Universal = enum_auto()
    PlanarComplex = enum_auto()
    PlanarComplexArray = enum_auto()
    Grouped = enum_auto()
#
# Name fragment of each GEMM kind; Gemm and Universal share "gemm".
GemmKindNames = {
    GemmKind.Gemm: "gemm",
    GemmKind.Sparse: "spgemm",
    GemmKind.Universal: "gemm",
    GemmKind.PlanarComplex: "gemm_planar_complex",
    GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
    GemmKind.Grouped: "gemm_grouped"
}
#
class RankKKind(enum.Enum):
    # Flavours of rank-k operation; only Universal is defined.
    Universal = enum_auto()
#
# Name fragment of each rank-k kind.
RankKKindNames = {
    RankKKind.Universal: "rank_k"
}
#
class TrmmKind(enum.Enum):
    # Flavours of TRMM operation; only Universal is defined.
    Universal = enum_auto()
#
# Name fragment of each TRMM kind.
TrmmKindNames = {
    TrmmKind.Universal: "trmm"
}
#
class SymmKind(enum.Enum):
    # Flavours of SYMM operation; only Universal is defined.
    Universal = enum_auto()
#
# Name fragment of each SYMM kind.
SymmKindNames = {
    SymmKind.Universal: "symm"
}
#
class EpilogueFunctor(enum.Enum):
    # Epilogue functor choices.
    LinearCombination = enum_auto()
    LinearCombinationClamp = enum_auto()
    FastLinearCombinationClamp = enum_auto()
#
# C++ class template emitted for each epilogue functor.
EpilogueFunctorTag = {
    EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination',
    EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp',
    EpilogueFunctor.FastLinearCombinationClamp: 'cutlass::epilogue::thread::FastLinearCombinationClamp'
}
#
class SwizzlingFunctor(enum.Enum):
    # Threadblock swizzling functor choices.
    Identity1 = enum_auto()
    Identity2 = enum_auto()
    Identity4 = enum_auto()
    Identity8 = enum_auto()
    Horizontal = enum_auto()
    BatchedIdentity1 = enum_auto()
    StridedDgradIdentity1 = enum_auto()
    StridedDgradIdentity4 = enum_auto()
    StridedDgradHorizontal = enum_auto()
#
# C++ type emitted for each swizzling functor.
SwizzlingFunctorTag = {
    # Kept for existing lookups that use the cutlass object as the key.
    cutlass.IdentitySwizzle1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
    # Identity1 previously had no entry of its own, unlike every other member.
    SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
    SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
    SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
    SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
    SwizzlingFunctor.Horizontal: 'cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle',
    SwizzlingFunctor.BatchedIdentity1: "cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle",
    SwizzlingFunctor.StridedDgradIdentity1: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>',
    SwizzlingFunctor.StridedDgradIdentity4: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>',
    SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
}
#
class SchedulerMode(enum.Enum):
    # Scheduling modes for grouped kernels.
    # (A stray trailing comma after the first member used to turn its value
    # into a one-element tuple.)
    Device = enum_auto()
    Host = enum_auto()
#
# C++ enumerator emitted for each scheduler mode.
SchedulerModeTag = {
    SchedulerMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
    SchedulerMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
}
#
# Short name of each scheduler mode, used in generated names.
ShortSchedulerModeNames = {
    SchedulerMode.Device: 'Device',
    SchedulerMode.Host: 'Host'
}
###################################################################################################
#
# C++ enumerator emitted for each convolution operator (fprop / dgrad / wgrad).
ConvKindTag = {
    cutlass.conv.Operator.fprop: 'cutlass::conv::Operator::kFprop',
    cutlass.conv.Operator.dgrad: 'cutlass::conv::Operator::kDgrad',
    cutlass.conv.Operator.wgrad: 'cutlass::conv::Operator::kWgrad'
}
# Lower-case name of each convolution operator.
ConvKindNames = {
    cutlass.conv.Operator.fprop: 'fprop',
    cutlass.conv.Operator.dgrad: 'dgrad',
    cutlass.conv.Operator.wgrad: 'wgrad',
}
#
# C++ enumerator emitted for each convolution iterator algorithm.
IteratorAlgorithmTag = {
    cutlass.conv.IteratorAlgorithm.analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
    cutlass.conv.IteratorAlgorithm.optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
    cutlass.conv.IteratorAlgorithm.fixed_channels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
    cutlass.conv.IteratorAlgorithm.few_channels: 'cutlass::conv::IteratorAlgorithm::kFewChannels'
}
# Lower-case name of each convolution iterator algorithm.
IteratorAlgorithmNames = {
    cutlass.conv.IteratorAlgorithm.analytic: 'analytic',
    cutlass.conv.IteratorAlgorithm.optimized: 'optimized',
    cutlass.conv.IteratorAlgorithm.fixed_channels: 'fixed_channels',
    cutlass.conv.IteratorAlgorithm.few_channels: 'few_channels'
}
#
class StrideSupport(enum.Enum):
    # Stride support selector for convolutions.
    Strided = enum_auto()
    Unity = enum_auto()
#
# C++ enumerator emitted for each stride support mode.
StrideSupportTag = {
    StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
    StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
}
# Name fragment of each stride support mode; Strided contributes nothing.
StrideSupportNames = {
    StrideSupport.Strided: '',
    StrideSupport.Unity: 'unity_stride',
}
class ConvMode(enum.Enum):
    # Cross-correlation vs. convolution mode selector.
    CrossCorrelation = enum_auto()
    Convolution = enum_auto()
#
# C++ enumerator emitted for each convolution mode.
ConvModeTag = {
    ConvMode.CrossCorrelation: 'cutlass::conv::Mode::kCrossCorrelation',
    ConvMode.Convolution: 'cutlass::conv::Mode::kConvolution'
}
###################################################################################################
#
class MathInstruction:
    """Plain record describing a math instruction: its shape, the element
    types of A, B and the accumulator, the opcode class (Simt by default)
    and the math operation (multiply_add by default)."""

    def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class=cutlass.OpClass.Simt, math_operation=MathOperation.multiply_add):
        self.instruction_shape = instruction_shape
        self.element_a, self.element_b = element_a, element_b
        self.element_accumulator = element_accumulator
        self.opcode_class, self.math_operation = opcode_class, math_operation
#
class TileDescription:
    """Record describing a threadblock tile: its shape, pipeline stages, warp
    counts, math instruction and compute-capability range.

    ``num_threads`` is derived as 32 times the product of ``warp_count``.
    """

    def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute):
        self.threadblock_shape = threadblock_shape
        #: number of pipeline stages
        self.stages: int = stages
        #: number of warps along x, y, z directions
        self.warp_count: list[int] = warp_count
        self.math_instruction = math_instruction
        #: minimum compute capability
        self.minimum_compute_capability: int = min_compute
        #: maximum compute capability
        self.maximum_compute_capability: int = max_compute
        #: number threads per threadblock (32 per warp)
        thread_total = 32
        for warps_along_dim in warp_count:
            thread_total *= warps_along_dim
        self.num_threads: int = thread_total

    def procedural_name(self):
        """Return "<M>x<N>_<K>x<stages>" built from the threadblock shape."""
        shape = self.threadblock_shape
        fields = (shape[0], shape[1], shape[2], self.stages)
        return "%dx%d_%dx%d" % fields
#
class TensorDescription:
    """Record describing a tensor operand: element type, layout, alignment
    and complex transform.

    The stored alignment is capped at ``128 // DataTypeSize[element]``.
    """

    def __init__(self, element, layout, alignment=1, complex_transform=cutlass.complex_transform.none):
        self.element = element
        self.layout = layout
        # Largest alignment (in elements) that fits in 128 bits.
        alignment_cap = 128 // DataTypeSize[element]
        self.alignment = min(alignment_cap, alignment)
        self.complex_transform = complex_transform
#
class SymmetricTensorDescription:
    """Record describing a symmetric tensor operand: element type, layout,
    fill mode, alignment, complex transform and side mode (Left by default)."""

    def __init__(self, element, layout, fill_mode, alignment=1, complex_transform=cutlass.complex_transform.none, side_mode=SideMode.Left):
        self.element, self.layout = element, layout
        self.fill_mode = fill_mode
        self.alignment = alignment
        self.complex_transform, self.side_mode = complex_transform, side_mode
#
class TriangularTensorDescription:
    """Record describing a triangular tensor operand: element type, layout,
    side mode, fill mode, diagonal type, alignment and complex transform."""

    def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment=1, complex_transform=cutlass.complex_transform.none):
        self.element, self.layout = element, layout
        self.side_mode, self.fill_mode = side_mode, fill_mode
        self.diag_type = diag_type
        self.alignment, self.complex_transform = alignment, complex_transform
###################################################################################################
#
def CalculateSmemUsage(operation):
    """Estimate an operation's shared-memory footprint.

    The per-stage size is computed from the threadblock shape and the
    DataTypeSize of the operands (divided by 8), multiplied by the number of
    stages, and finally shifted right by 10 bits.

    :param operation: object exposing ``tile_description``, ``operation_kind``,
        ``A`` and, for sparse GEMMs, ``gemm_kind`` and ``B``
    :return: ``(per-stage size * stages) >> 10``
    """
    tile = operation.tile_description
    cta_shape = tile.threadblock_shape
    stages = tile.stages
    is_sparse_gemm = operation.operation_kind == OperationKind.Gemm and \
        operation.gemm_kind == GemmKind.Sparse
    a_bits = DataTypeSize[operation.A.element]
    if not is_sparse_gemm:
        # Few BLAS3 operations only have A tensor, so A's element size is
        # used for both operand tiles.
        a_tile = a_bits * cta_shape[0] * cta_shape[2] // 8
        b_tile = a_bits * cta_shape[1] * cta_shape[2] // 8
        smem_per_stage = a_tile + b_tile
    else:
        # Elements represented by 8 bits of metadata (based on 4:8, 2:4 or 1:2 sparsity)
        elements_per_8b_md = {32: 2, 4: 8}.get(a_bits, 4)
        half_k = cta_shape[2] // 2
        a_tile = a_bits * cta_shape[0] * half_k // 8
        b_tile = DataTypeSize[operation.B.element] * cta_shape[1] * cta_shape[2] // 8
        metadata = cta_shape[0] * half_k // elements_per_8b_md
        smem_per_stage = a_tile + b_tile + metadata
    return (smem_per_stage * stages) >> 10
###################################################################################################

View File

@ -0,0 +1,74 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import rmm
import numpy as np
class PoolMemoryManager:
    """Owns an RMM pool memory resource and installs it for the current device.

    The pool sits on top of a `CudaMemoryResource` and is wrapped in a
    `TrackingResourceAdaptor`, which is registered as the current device
    resource when the manager is constructed.
    """

    def __init__(self, init_pool_size: int, max_pool_size: int) -> None:
        """Create the pool with the given initial and maximum sizes."""
        upstream = rmm.mr.CudaMemoryResource()
        self.pool = rmm.mr.PoolMemoryResource(
            upstream,
            initial_pool_size=init_pool_size,
            maximum_pool_size=max_pool_size,
        )
        # The tracking adaptor is what reports the allocated byte count
        tracker = rmm.mr.TrackingResourceAdaptor(self.pool)
        self.mr = tracker
        rmm.mr.set_current_device_resource(tracker)

    def get_allocated_size(self):
        """Return the byte count reported by the tracking adaptor."""
        allocated = self.mr.get_allocated_bytes()
        return allocated

    def pool_size(self):
        """Return the size reported by the underlying pool resource."""
        current = self.pool.pool_size()
        return current
def todevice(host_data, dtype=np.float32):
    """
    Pass the host_data to device memory

    A list is first converted to a numpy array of `dtype`; a numpy array is
    copied as-is (`dtype` is not applied to it). Any other input type is not
    handled and yields None.
    """
    if isinstance(host_data, list):
        host_array = np.array(host_data, dtype=dtype)
    elif isinstance(host_data, np.ndarray):
        host_array = host_data
    else:
        # Unsupported input: fall through without copying anything
        return None
    return rmm.DeviceBuffer.to_device(host_array.tobytes())
def device_mem_alloc(size):
    """Return a new `rmm.DeviceBuffer` constructed with the given `size`."""
    buffer = rmm.DeviceBuffer(size=size)
    return buffer
def align_size(size, alignment=256):
    """Round `size` up to the next multiple of `alignment` (default 256)."""
    # Adding alignment - 1 before the floor division rounds up instead of down
    padded = size + alignment - 1
    num_units = padded // alignment
    return num_units * alignment
def get_allocated_size():
    """Return `get_allocated_bytes()` of RMM's current device resource."""
    return rmm.mr.get_current_device_resource().get_allocated_bytes()

View File

@ -0,0 +1,110 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import ctypes
from cuda import cuda
################################################################################
#
# Launch configuration
#
################################################################################
class LaunchConfiguration:
    """Grid shape, block shape and dynamic shared memory size of a kernel launch.

    Args:
        grid: three-element grid shape; defaults to a fresh ``[1, 1, 1]``.
        block: three-element block shape; defaults to a fresh ``[1, 1, 1]``.
        smem: value stored as ``shared_memory_capacity``; defaults to 0.
    """

    def __init__(self, grid=None, block=None, smem=0):
        # A new list is built per instance when the argument is omitted, so
        # mutating one configuration's grid/block cannot leak into others
        # (a literal list default would be shared across all instances).
        self.grid = [1, 1, 1] if grid is None else grid
        self.block = [1, 1, 1] if block is None else block
        self.shared_memory_capacity = smem
################################################################################
#
# Base class for an executable operation
#
# ##############################################################################
class ExecutableOperation:
    """Base class for an operation that can be launched as a CUDA kernel.

    Holds the operation description plus the `module` and `kernel` handles,
    which start as None and are expected to be filled in elsewhere.
    Subclasses override the methods that raise NotImplementedError.
    """

    def __init__(self, operation):
        self.operation = operation
        # Populated outside this class before the operation can be run
        self.module = None
        self.kernel = None

    #
    def name(self):
        """Return the procedural name of the wrapped operation."""
        return self.operation.procedural_name()

    #
    def emit(self):
        """Return the emitted source for the operation (empty in the base class)."""
        return ''

    #
    def can_implement(self, configuration, arguments):
        """Must be overridden by subclasses."""
        raise NotImplementedError()

    #
    def get_host_workspace_size(self, arguments):
        """Must be overridden by subclasses."""
        raise NotImplementedError()

    #
    def get_device_workspace_size(self, arguments):
        """Must be overridden by subclasses."""
        raise NotImplementedError()

    #
    def plan(self, arguments):
        """Must be overridden by subclasses."""
        raise NotImplementedError()

    #
    def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=cuda.CUstream(0)):
        """Must be overridden by subclasses."""
        raise NotImplementedError()

    #
    def run(self, host_workspace, device_workspace, launch_config, stream=cuda.CUstream(0)):
        """Launch `self.kernel` with `host_workspace` as its only parameter.

        `host_workspace` must support the writable buffer protocol (it is
        wrapped with `from_buffer`). `device_workspace` is not used here.
        Returns the status produced by `cuLaunchKernel`.
        """
        # View the host workspace as a C char array without copying it
        params_buffer = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)

        # Kernel parameter list: one pointer, to the parameter bytes
        kernel_params = (ctypes.c_void_p * 1)()
        kernel_params[0] = ctypes.addressof(params_buffer)

        grid = launch_config.grid
        block = launch_config.block
        (status,) = cuda.cuLaunchKernel(
            self.kernel,
            grid[0], grid[1], grid[2],
            block[0], block[1], block[2],
            launch_config.shared_memory_capacity,
            stream,
            kernel_params,
            0)
        return status

View File

@ -0,0 +1,402 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from pycutlass import *
from pycutlass.c_types import get_reduction_params
import cutlass
from cuda import cuda
try:
import torch
torch_available = True
except ImportError:
torch_available = False
import numpy as np
from typing import Union
from cuda import cudart
class ReductionOperation:
    """Placeholder so the name can be used in annotations before the full class is defined later in this file."""
class ReductionArguments:
    """
    Arguments of reduction

    Resolves the destination and source operands to device pointers, records
    the problem size and the epilogue scalars, and packs everything into
    `self.host_workspace`, the byte buffer handed to the kernel launch.
    """

    def __init__(self, operation: ReductionOperation,
                 problem_size: 'list[int]', partitions: int,
                 workspace: cuda.CUdeviceptr,
                 destination: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
                 source: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]', **kwargs) -> None:
        """Build the argument pack.

        The type of `destination` selects how both `destination` and `source`
        are converted to device pointers. An optional `output_op` keyword
        supplies `alpha` and `beta`; otherwise they default to 1.0 and 0.0.

        Raises:
            TypeError: if `destination` is not a numpy array, a torch tensor
                (when torch is importable) or a `cuda.CUdeviceptr`.
        """
        self.operation = operation

        #: pointer to the workspace
        self.ptr_workspace = workspace

        #: number of split-k partitions
        self.partitions = partitions

        if isinstance(destination, np.ndarray):
            # Keep the host array so sync() can copy the result back into it
            self.host_D = destination
            self.destination_buffer = NumpyFrontend.argument(destination, True)
            self.source_buffer = NumpyFrontend.argument(source, False)
            self.ptr_destination = cuda.CUdeviceptr(
                self.destination_buffer.ptr)
            self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
        elif torch_available and isinstance(destination, torch.Tensor):
            self.ptr_destination = TorchFrontend.argument(destination)
            self.ptr_source = TorchFrontend.argument(source)
        elif isinstance(destination, cuda.CUdeviceptr):
            self.ptr_destination = destination
            self.ptr_source = source
        else:
            raise TypeError("unknown Type")

        self.problem_size = MatrixCoord_(
            problem_size[0], problem_size[1]
        )

        # Size of one partition: rows * columns * element size // 8
        self.partition_stride = problem_size[0] * \
            problem_size[1] * DataTypeSize[operation.C.element] // 8

        if "output_op" in kwargs:
            self.alpha = kwargs["output_op"].alpha
            self.beta = kwargs["output_op"].beta
        else:
            self.alpha = 1.0
            self.beta = 0.0

        # get arguments
        self.get_arguments()

    @staticmethod
    def get_tensor_ref(extent: 'tuple[int]', device_ptr: cuda.CUdeviceptr, layout: cutlass.layout):
        """Return a `TensorRef2D_` for a row-major tensor.

        The second entry of `extent` is passed as the second constructor
        argument (presumably the row stride).

        Raises:
            ValueError: for any layout other than `cutlass.RowMajor`.
        """
        if layout == cutlass.RowMajor:
            return TensorRef2D_(int(device_ptr), extent[1])
        else:
            raise ValueError("unknown layout type")

    def get_arguments(self):
        """Pack the kernel arguments into `self.host_workspace`.

        NOTE: `self.alpha` and `self.beta` are overwritten with their
        converted values for float16 and int32 compute types, so this method
        is meant to be called once (it is called from `__init__`).
        """
        ref_workspace = ReductionArguments.get_tensor_ref(
            extent=[self.problem_size.row, self.problem_size.column],
            device_ptr=self.ptr_workspace, layout=cutlass.RowMajor)
        ref_source = ReductionArguments.get_tensor_ref(
            extent=[self.problem_size.row, self.problem_size.column],
            device_ptr=self.ptr_source, layout=cutlass.RowMajor)
        ref_destination = ReductionArguments.get_tensor_ref(
            extent=[self.problem_size.row, self.problem_size.column],
            device_ptr=self.ptr_destination, layout=cutlass.RowMajor)

        argument_type, epilogue_type = get_reduction_params(
            self.operation.element_compute)

        if self.operation.element_compute == cutlass.float16:
            self.alpha = cutlass.float16(self.alpha).storage
            self.beta = cutlass.float16(self.beta).storage
        elif self.operation.element_compute == cutlass.int32:
            self.alpha = int(self.alpha)
            self.beta = int(self.beta)

        output_op = epilogue_type(self.alpha, self.beta, 0, 0)
        self.c_arguments = argument_type(
            self.problem_size, self.partitions,
            self.partition_stride, ref_workspace,
            ref_destination, ref_source,
            output_op
        )

        params_ = self.operation.rt_module.get_args(
            ctypes.byref(self.c_arguments))
        self.host_workspace = bytearray(params_.contents)

    def sync(self):
        """Wait for the device and, for numpy destinations, copy the result back.

        Raises:
            RuntimeError: if the synchronization or the copy reports an error.
        """
        err, = cudart.cudaDeviceSynchronize()
        # cudaDeviceSynchronize is a runtime-API call, so its status is
        # checked against the runtime enum rather than the driver enum
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("CUDA Error %s" % str(err))

        if hasattr(self, "host_D"):
            err, = cuda.cuMemcpyDtoH(
                self.host_D, self.ptr_destination, self.host_D.size * self.host_D.itemsize)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    def free(self):
        """Drop the references to the buffers created for numpy operands, if any."""
        if hasattr(self, "destination_buffer"):
            del self.destination_buffer
        if hasattr(self, "source_buffer"):
            del self.source_buffer
class ReductionRT(ExecutableOperation):
    """
    ReductionRT manages the CUTLASS runtime components for reduction

    It carries the device and host source templates for the reduction kernel
    and computes the launch configuration. `shared_memory_capacity` is read
    by `plan` and `initialize` but is not assigned in this class; it is
    presumably set externally once the kernel has been compiled.
    """
    KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
// Dynamic shared memory base pointer
extern __shared__ int SharedStorageBase[];
// Declare pointer to dynamic shared memory.
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
${operation_name}${operation_suffix} op;
op(params, *shared_storage);
}
'''
    HostTemplate = r'''
extern "C" {
// Get the size of params in bytes
int ${operation_name}_get_param_size(){
return sizeof(${operation_name}${operation_suffix}::Params);
}
// Get the size of dynamic shared memory in bytes
int ${operation_name}_shared_memory_size() {
return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
}
// Get the params as byte array
char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Params* params){
char *bytes = ((char*)(params));
char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
output[i] = bytes[i];
return output;
}
}
'''

    def __init__(self, operation: ReductionOperation):
        super().__init__(operation)
        self.operation: ReductionOperation = operation
        self.emitter = EmitReductionInstance('_type')
        # Each thread handles `count` elements per access
        self.elements_per_access = self.operation.count
        # ctypes signature: a single pointer to the argument struct
        argument_type = get_reduction_params(operation.element_compute)[0]
        self.argtype = [ctypes.POINTER(argument_type)]

    def emit(self):
        """Return the C++ instance definition produced by the emitter."""
        return self.emitter.emit(self.operation)

    def plan(self, arguments: ReductionArguments):
        """Return the launch configuration covering `arguments.problem_size`."""
        tile = self.operation.shape

        def tiles_needed(extent, tile_extent):
            # Ceiling division: number of tiles required to cover the extent
            return (extent + tile_extent - 1) // tile_extent

        block_shape = [tile.column() // self.elements_per_access, tile.row(), 1]
        grid_shape = [
            tiles_needed(arguments.problem_size.row, tile.row()),
            tiles_needed(arguments.problem_size.column, tile.column()),
            1,
        ]
        return LaunchConfiguration(grid_shape, block_shape, self.shared_memory_capacity)

    def initialize(self):
        """Set the kernel's maximum dynamic shared memory size.

        Raises:
            RuntimeError: if `cuFuncSetAttribute` does not report success.
        """
        (status,) = cuda.cuFuncSetAttribute(
            self.kernel,
            attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
            value=self.shared_memory_capacity)
        if status != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(status))
class ReductionOperation:
    """
    CUTLASS Reduction Operation

    shape: shape of CTA
    C: description of the output operand
    element_accumulator: datatype of accumulator
    element_workspace: datatype of workspace (defaults to the accumulator type)
    element_compute: datatype used for compute (defaults to the accumulator type)
    epilogue_functor: output operator
    count: reduce op processing size
    partitions_per_stage: number of partitions to reduce per stage
    """

    def __init__(self, shape: cutlass.MatrixCoord, C: TensorDescription,
                 element_accumulator, element_workspace=None,
                 element_compute=None, epilogue_functor: EpilogueFunctor = EpilogueFunctor.LinearCombination,
                 count: int = 1, partitions_per_stage: int = 4) -> None:
        """ Constructor
        """
        self.shape = shape

        #: epilogue functor (default: LinearCombination)
        self.epilogue_functor: EpilogueFunctor = epilogue_functor

        #: datatype of accumulator
        self.element_accumulator = element_accumulator

        #: datatype of workspace; falls back to the accumulator type
        self.element_workspace = (
            element_accumulator if element_workspace is None else element_workspace)

        #: datatype of compute; falls back to the accumulator type
        self.element_compute = (
            element_accumulator if element_compute is None else element_compute)

        #: datatype of output
        self.element_output = C.element

        #: operand C
        self.C: TensorDescription = C

        #: reduce op processing size
        self.count: int = count

        #: number of partitions to reduce per stage
        self.partitions_per_stage: int = partitions_per_stage

        # Runtime module; constructed last because it reads attributes set above
        self.rt_module: ReductionRT = ReductionRT(self)

    #
    def extended_name(self):
        """Return the workspace/accumulator/compute/output type names joined by underscores."""
        fields = ('element_workspace', 'element_accumulator',
                  'element_compute', 'element_output')
        substitutions = {
            field: DataTypeNames[getattr(self, field)] for field in fields
        }
        return SubstituteTemplate(
            "${element_workspace}_${element_accumulator}_${element_compute}_${element_output}",
            substitutions)

    #
    def configuration_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size'''
        tile = "%dx%d" % (self.shape.row(), self.shape.column())
        return SubstituteTemplate(
            "cutlass_reduce_split_k_${extended_name}_${threadblock}",
            {
                'extended_name': self.extended_name(),
                'threadblock': tile
            })

    #
    def procedural_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size'''
        return self.configuration_name()

    def run(self, arguments: ReductionArguments) -> cuda.CUresult:
        """
        Configure and launch the cuda kernel with input arguments

        Raises RuntimeError when the launch does not report success;
        otherwise returns the status.
        """
        # get launch configuration
        launch_config = self.rt_module.plan(arguments)

        # launch the kernel; no device workspace is used (None is passed)
        status = self.rt_module.run(
            arguments.host_workspace, None, launch_config)

        if status != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('CUDA Error %s' % str(status))
        return status
class EmitReductionInstance:
    """Emits the C++ definition of a split-k reduction kernel instance."""

    def __init__(self, operation_suffix='') -> None:
        #: suffix appended to the operation name of the emitted struct
        self.operation_suffix = operation_suffix
        #: headers associated with the emitted code
        self.includes = [
            "cutlass/cutlass.h",
            "cutlass/numeric_types.h",
            "cutlass/arch/arch.h",
            "cutlass/arch/mma.h",
            "cutlass/layout/matrix.h",
            "cutlass/gemm/device/gemm.h",
            "cutlass/gemm/device/gemm_universal_adapter.h",
            "cutlass/gemm/kernel/default_gemm_universal.h",
            "cutlass/reduction/kernel/reduce_split_k.h",
            "cutlass/reduction/thread/reduction_operators.h"
        ]
        self.template = """
// Reduction kernel instance
using ${operation_name}_base =
typename cutlass::reduction::kernel::ReduceSplitK<
cutlass::MatrixShape<${shape_row}, ${shape_column}>,
${epilogue_functor}<
${element_output},
${epilogue_vector_length},
${element_accumulator},
${element_compute}
>,
cutlass::reduction::thread::ReduceAdd<
${element_accumulator},
${element_output},
${count}>,
${partition_per_stage}>;
struct ${operation_name}${operation_suffix}:
public ${operation_name}_base { };
"""

    def emit(self, operation: ReductionOperation):
        """Return the template filled in with the properties of `operation`."""
        # Epilogue vector length: C's alignment, capped so that one access
        # is at most 128 (in the units of DataTypeSize) wide
        size_c = DataTypeSize[operation.C.element]
        access_width = min(operation.C.alignment * size_c, 128)
        epilogue_vector_length = int(access_width / size_c)

        tile = operation.shape
        values = {
            'operation_name': operation.configuration_name(),
            'operation_suffix': self.operation_suffix,
            'shape_row': str(tile.row()),
            'shape_column': str(tile.column()),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_output': DataTypeTag[operation.element_output],
            'epilogue_vector_length': str(epilogue_vector_length),
            'element_accumulator': DataTypeTag[operation.element_accumulator],
            'element_compute': DataTypeTag[operation.element_compute],
            'element_workspace': DataTypeTag[operation.element_workspace],
            'count': str(operation.count),
            'partition_per_stage': str(operation.partitions_per_stage)
        }
        return SubstituteTemplate(self.template, values)

View File

@ -0,0 +1,71 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typeguard import typechecked
import numpy as np
try:
import torch
torch_available = True
except ImportError:
torch_available = False
from cuda import cuda
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
import cutlass
# @typechecked
class TensorRef:
    """
    Python Wrapper for cutlass.TensorRef

    Accepts a numpy array, a torch tensor or a cupy array (when the
    respective package is importable), a `cuda.CUdeviceptr`, or a raw integer
    address, and stores the resulting reference in `self.tensor_ref`.
    """

    def __init__(self, tensor, dtype, layout) -> None:
        device_ptr = None
        if isinstance(tensor, np.ndarray):
            address = tensor.__array_interface__['data'][0]
        elif torch_available and isinstance(tensor, torch.Tensor):
            address = tensor.data_ptr()
        elif cupy_available and isinstance(tensor, cp.ndarray):
            address = int(tensor.data.ptr)
        elif isinstance(tensor, cuda.CUdeviceptr):
            # Already a pointer object: use it directly
            device_ptr = tensor
        elif isinstance(tensor, int):
            address = tensor
        else:
            raise NotImplementedError(tensor)

        if device_ptr is None:
            device_ptr = cuda.CUdeviceptr(address)

        # the dtype(0) is used to overload between different data types
        # with the same layout
        self.tensor_ref = cutlass.get_tensor_ref(int(device_ptr), dtype(0), layout)

View File

@ -0,0 +1,4 @@
from pycutlass.test.profiler import *
from pycutlass.test.conv2d_testbed import *
from pycutlass.test.gemm_testbed import *
from pycutlass.test.gemm_grouped_testbed import *

View File

@ -0,0 +1,646 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
from pycutlass.test import *
from time import sleep
from bfloat16 import bfloat16
import subprocess
from typeguard import typechecked
import re
def getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand):
    """Build the cutlass tensor reference matching the dtype of `tensor`.

    `operand` is one of "a", "b", "c" or "d" and selects which implicit-GEMM
    extent is used to pack `tensor_layout`. For int8 tensors the reference
    type additionally depends on `tensor_layout`.

    Raises:
        ValueError: for an unknown operand or an unsupported dtype.
    """
    ptr = tensor.__array_interface__['data'][0]

    if operand == "a":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
    elif operand == "b":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
    elif operand in ["c", "d"]:
        tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
    else:
        raise ValueError("unknown operand: " + operand)

    layout = tensor_layout.packed(tensor_coord)

    # Layout-independent dtypes, checked in order; the reference class is
    # looked up by name only once a dtype matches
    nhwc_refs = (
        (np.float64, "TensorRefF64NHWC"),
        (np.float32, "TensorRefF32NHWC"),
        (np.float16, "TensorRefF16NHWC"),
        (bfloat16, "TensorRefBF16NHWC"),
        (np.int32, "TensorRefS32NHWC"),
    )
    for candidate, ref_name in nhwc_refs:
        if tensor.dtype == candidate:
            return getattr(cutlass, ref_name)(ptr, layout)

    if tensor.dtype == np.int8:
        # int8 has dedicated reference types for the two 32-wide layouts
        if tensor_layout == cutlass.TensorNC32HW32:
            return cutlass.TensorRefS8NC32HW32(ptr, layout)
        if tensor_layout == cutlass.TensorC32RSK32:
            return cutlass.TensorRefS8C32RSK32(ptr, layout)
        return cutlass.TensorRefS8NHWC(ptr, layout)

    raise ValueError("unsupported data type")
def getTensorView(tensor, tensor_layout, conv_kind, problem_size, operand):
    """Build the cutlass tensor view matching the dtype of `tensor`.

    The view pairs the reference returned by `getTensorRef` with the
    implicit-GEMM extent selected by `operand` ("a", "b", "c" or "d").

    Raises:
        ValueError: for an unknown operand or an unsupported dtype.
    """
    tensor_ref = getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand)

    if operand == "a":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
    elif operand == "b":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
    elif operand in ["c", "d"]:
        tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
    else:
        raise ValueError("unknown operand: " + operand)

    # Layout-independent dtypes, checked in order; the view class is looked
    # up by name only once a dtype matches
    nhwc_views = (
        (np.float64, "TensorViewF64NHWC"),
        (np.float32, "TensorViewF32NHWC"),
        (np.float16, "TensorViewF16NHWC"),
        (bfloat16, "TensorViewBF16NHWC"),
        (np.int32, "TensorViewS32NHWC"),
    )
    for candidate, view_name in nhwc_views:
        if tensor.dtype == candidate:
            return getattr(cutlass, view_name)(tensor_ref, tensor_coord)

    if tensor.dtype == np.int8:
        # int8 has dedicated view types for the two 32-wide layouts
        if tensor_layout == cutlass.TensorNC32HW32:
            return cutlass.TensorViewS8NC32HW32(tensor_ref, tensor_coord)
        if tensor_layout == cutlass.TensorC32RSK32:
            return cutlass.TensorViewS8C32RSK32(tensor_ref, tensor_coord)
        return cutlass.TensorViewS8NHWC(tensor_ref, tensor_coord)

    raise ValueError("unsupported data type")
# @typechecked
class Conv2dLauncher:
"""
Launcher that runs the operation on given problem size
"""
def __init__(self, operation: 'Conv2dOperation', seed: int=2080, interleaved=False,
        verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
    """Compile `operation` together with its reduction kernel and set up the launcher.

    Args:
        operation: the conv2d operation to run.
        seed: stored as `self.seed`; also part of the result-cache file name.
        interleaved: stored as `self.interleaved`.
        verification: whether the output result is verified.
        profiling: whether the kernel's runtime is profiled.
        warmup_iterations: stored as `self.warmup_iterations`.
        iterations: stored as `self.iterations`.
        **kwargs: an optional `sleep` entry is stored as `self.sleep_time`
            (default 0).
    """
    self.enable_cached_results = True
    self.interleaved = interleaved

    # create the reduction kernel
    self.reduction_operation = ReductionOperation(
        shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
        C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
        element_compute=operation.element_epilogue,
        count=operation.C.alignment
    )

    #: verify the output result
    self.verification = verification
    #: profile the kernel's runtime
    self.profiling = profiling

    # Created once; the original constructed a second, redundant timer at
    # the end of this method
    self.timer = GpuTimer()

    self.warmup_iterations = warmup_iterations
    self.iterations = iterations
    self.sleep_time = kwargs.get("sleep", 0)

    #
    # Compile the operator
    #
    pycutlass.compiler.add_module([operation, self.reduction_operation])

    self.operation = operation

    self.dtype_A = Conv2dLauncher.numpy_type(operation.A.element)
    self.layout_A = operation.A.layout
    self.dtype_B = Conv2dLauncher.numpy_type(operation.B.element)
    self.layout_B = operation.B.layout
    self.dtype_C = Conv2dLauncher.numpy_type(operation.C.element)
    self.layout_C = operation.C.layout
    # D shares the element type and layout of C
    self.dtype_D = Conv2dLauncher.numpy_type(operation.C.element)
    self.layout_D = operation.C.layout

    # Range of the random initial values: narrower element and accumulator
    # types get a smaller range
    accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
    element_size = DataTypeSize[operation.A.element]
    if element_size <= 8:
        self.scope = 1
    elif element_size == 16:
        if accumulator_size <= 16:
            self.scope = 2
        else:
            self.scope = 4
    else:
        self.scope = 7

    # Seed
    self.seed = seed

    self.conv_kind = operation.conv_kind

    #
    # Get the host reference function
    #
    self.element_compute = operation.element_epilogue
    self.host_conv2d = cutlass.test.conv.host.conv2d
@staticmethod
def numpy_type(type):
    """Map a cutlass element type to the matching numpy (or bfloat16) type.

    Raises:
        ValueError: if `type` is none of the supported cutlass types.
    """
    conversions = (
        (cutlass.float64, np.float64),
        (cutlass.float32, np.float32),
        (cutlass.float16, np.float16),
        (cutlass.bfloat16, bfloat16),
        (cutlass.int32, np.int32),
        (cutlass.int8, np.int8),
    )
    for cutlass_type, host_type in conversions:
        if type == cutlass_type:
            return host_type
    raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def print_problem_size(self, p, split_k_mode=1):
    """Print a one-line description of problem size `p` and the split-k mode."""
    # Values in the order the placeholders appear; C is printed both in the
    # nhwc group and in the krsc group
    fields = (
        p.N, p.H, p.W, p.C,
        p.K, p.R, p.S, p.C,
        p.pad_h, p.pad_w,
        p.stride_h, p.stride_w,
        p.dilation_h, p.dilation_w,
        p.split_k_slices, split_k_mode,
    )
    description = "nhwc_%dx%dx%dx%d_krsc_%dx%dx%dx%d_padding_%dx%d_stride_%dx%d_dilation_%dx%d_splitkslices_%d_splitkmode_%d" % fields
    print(description)
def uniform_init(self, size, dtype):
    """Return a random array of `size` and `dtype` with values bounded by `self.scope`.

    Floating-point types are drawn from a range shifted down by 0.5 and then
    rounded up with `np.ceil`; other types are drawn from a range widened by
    1 on each side and converted with `astype`.
    """
    if dtype in [np.float32, np.float16, bfloat16, np.float64]:
        lower, upper = -self.scope - 0.5, self.scope - 0.5
        samples = np.random.uniform(low=lower, high=upper, size=size).astype(dtype)
        return np.ceil(samples)

    lower, upper = -self.scope - 1, self.scope + 1
    return np.random.uniform(low=lower, high=upper, size=size).astype(dtype)
def eq_gemm_size(self, problem_size):
    """Return the GEMM coordinate equivalent to `problem_size` for `self.conv_kind`.

    fprop and dgrad have dedicated mappings; every other kind uses the third
    mapping below.
    """
    n, h, w, c = problem_size.N, problem_size.H, problem_size.W, problem_size.C
    k, r, s = problem_size.K, problem_size.R, problem_size.S
    p, q = problem_size.P, problem_size.Q

    if self.conv_kind == cutlass.conv.Operator.fprop:
        return cutlass.gemm.GemmCoord(n * p * q, k, r * s * c)
    if self.conv_kind == cutlass.conv.Operator.dgrad:
        return cutlass.gemm.GemmCoord(n * h * w, c, k * r * s)
    return cutlass.gemm.GemmCoord(k, r * s * c, n * p * q)
def bytes(self, problem_size, alpha, beta):
    """Return the memory traffic estimate of the equivalent GEMM.

    Counts the A (m x k), B (n x k) and C (m x n) operands once each, plus a
    second pass over C when `beta` is non-zero. `alpha` is not used.
    """
    mnk = self.eq_gemm_size(problem_size)
    m, n, k = mnk.m(), mnk.n(), mnk.k()
    operation = self.operation

    traffic_a = (DataTypeSize[operation.A.element] * m // 8) * k
    traffic_b = (DataTypeSize[operation.B.element] * n // 8) * k
    traffic_c = (DataTypeSize[operation.C.element] * m // 8) * n

    total = traffic_a + traffic_b + traffic_c
    if beta != 0:
        # C is counted a second time when it contributes to the result
        total += traffic_c
    return total
def flops(self, problem_size):
    """Return the floating-point operation count of the equivalent GEMM.

    The mainloop contributes 2*m*n*k and the epilogue 2*m*n operations.
    """
    mnk = self.eq_gemm_size(problem_size)
    m, n, k = mnk.m(), mnk.n(), mnk.k()

    mainloop = m * n * k * 2
    epilogue = m * n * 2

    # Adjust mainloop flop for dgrad stride
    if self.conv_kind == cutlass.conv.Operator.dgrad:
        mainloop = mainloop // (problem_size.stride_h * problem_size.stride_w)

    # TODO complex-value support: the C++ code this was taken from
    # multiplies the total by 4 for MathOperationID::kMultiplyAddComplex
    return mainloop + epilogue
def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
    """Return the reference result for the given problem.

    With result caching enabled, the return value is the cached hash of the
    reference output, computed on the host and appended to the cache file on
    a miss. With caching disabled, the reference output tensor itself is
    returned.
    """
    # Cast the scalars to the epilogue compute type; other compute types
    # use them unchanged
    if self.element_compute == cutlass.float16:
        alpha, beta = cutlass.float16(alpha), cutlass.float16(beta)
    elif self.element_compute == cutlass.int32:
        alpha, beta = int(alpha), int(beta)

    # Try to load a previously cached result
    found_in_cache = False
    if self.enable_cached_results:
        # get problem key
        test_key = cutlass.test.conv.host.CreateCachedConv2dTestKey(
            self.conv_kind, problem_size, alpha, beta,
            getTensorView(tensor_A, self.layout_A, self.conv_kind, problem_size, "a"),
            getTensorView(tensor_B, self.layout_B, self.conv_kind, problem_size, "b"),
            getTensorView(tensor_C, self.layout_C, self.conv_kind, problem_size, "c"),
        )
        test_result = cutlass.test.conv.host.CachedTestResult()
        cache_file = "cached_results_SM%d_%d.txt" % (self.operation.arch, self.seed)
        listing = cutlass.test.conv.host.CachedTestResultListing(cache_file)
        lookup = listing.find(test_key)
        found_in_cache = lookup[0]
        if found_in_cache:
            test_result = lookup[1]

    if found_in_cache:
        return test_result.D

    # compute the conv2d on host
    tensor_D_ref = np.ones_like(tensor_C)
    ref_A = getTensorRef(tensor_A, self.layout_A, self.conv_kind, problem_size, "a")
    ref_B = getTensorRef(tensor_B, self.layout_B, self.conv_kind, problem_size, "b")
    ref_C = getTensorRef(tensor_C, self.layout_C, self.conv_kind, problem_size, "c")
    ref_D = getTensorRef(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
    self.host_conv2d(
        self.conv_kind, problem_size,
        ref_A, ref_B, ref_C, ref_D,
        alpha, beta
    )
    view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")

    if not self.enable_cached_results:
        return tensor_D_ref

    # Record the hash of the fresh result in the cache file
    test_result.D = cutlass.test.conv.host.TensorHash(view_D_ref)
    listing = cutlass.test.conv.host.CachedTestResultListing(cache_file)
    listing.append(test_key, test_result)
    listing.write(cache_file)
    return test_result.D
def equal(self, tensor_D, tensor_D_ref, problem_size):
    """Compare the device output ``tensor_D`` with the reference.

    When cached results are enabled, ``tensor_D_ref`` is compared against the
    hash of ``tensor_D``; otherwise both tensors are wrapped in tensor views
    and handed to the host-side ``equals`` helper.
    """
    view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
    if not self.enable_cached_results:
        # Element-wise comparison of the two output tensors.
        view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
        return cutlass.test.conv.host.equals(view_D, view_D_ref)
    # Cached mode: only the hash of the output is compared.
    return cutlass.test.conv.host.TensorHash(view_D) == tensor_D_ref
def run_cutlass_profiler(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial, alpha=1.0, beta=0.0):
    """Profile this operation with the external ``cutlass_profiler`` binary.

    The binary is looked up at ``$CUTLASS_PATH/build/tools/profiler`` and is
    invoked with the extents, padding, stride, dilation and split-k settings
    read from ``problem_size``.

    Args:
        problem_size: conv2d problem description whose fields are forwarded
            on the command line.
        split_k_mode: ``Serial`` is forwarded as "serial", any other value as
            "parallel".
        alpha: epilogue scalar forwarded as ``--alpha``.
        beta: epilogue scalar forwarded as ``--beta``.

    Returns:
        The number printed after "Runtime:" in the profiler output, as float.

    Raises:
        AssertionError: if ``CUTLASS_PATH`` is not set, or if the byte / FLOP
            counts reported by the profiler differ from ``self.bytes`` /
            ``self.flops`` for this problem.
    """
    if split_k_mode == cutlass.conv.SplitKMode.Serial:
        split_k_mode_ = "serial"
    else:
        split_k_mode_ = "parallel"

    cutlass_path = os.getenv('CUTLASS_PATH')
    assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."

    values = {
        "profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
        "kernel_name": self.operation.procedural_name(),
        "verification_providers": "device",
        "provider": "cutlass",
        'n': str(problem_size.N),
        'h': str(problem_size.H),
        'w': str(problem_size.W),
        'c': str(problem_size.C),
        'k': str(problem_size.K),
        'r': str(problem_size.R),
        's': str(problem_size.S),
        'p': str(problem_size.P),
        'q': str(problem_size.Q),
        'pad_h': str(problem_size.pad_h),
        'pad_w': str(problem_size.pad_w),
        'stride_h': str(problem_size.stride_h),
        'stride_w': str(problem_size.stride_w),
        'dilation_h': str(problem_size.dilation_h),
        'dilation_w': str(problem_size.dilation_w),
        'split_k_slices': str(problem_size.split_k_slices),
        'split_k_mode': split_k_mode_,
        'alpha': str(alpha),
        'beta': str(beta),
        'warmup': str(self.warmup_iterations),
        'profile': str(self.iterations)
    }

    # Every placeholder must use the ${name} form; a bare {stride_h} would be
    # left unsubstituted and passed to the profiler verbatim.
    cmd_template = \
        "${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
        " --providers=${provider} --n=${n} --h=${h} --w=${w} --c=${c} --k=${k} --r=${r} --s=${s} --p=${p}" \
        " --q=${q} --pad_h=${pad_h} --pad_w=${pad_w} --stride_h=${stride_h} --stride_w=${stride_w}" \
        " --dilation_h=${dilation_h} --dilation_w=${dilation_w} --warmup-iterations=${warmup} --profiling-iterations=${profile}" \
        " --split_k_slices=${split_k_slices} --alpha=${alpha} --beta=${beta} --split_k_mode=${split_k_mode}"

    cmd = SubstituteTemplate(cmd_template, values)
    result = subprocess.getoutput(cmd)

    # Pull the three reported figures out of the textual profiler output.
    # A missing field makes re.search return None and fails on .group().
    m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
    runtime = float(m.group('runtime'))

    m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
    profiled_bytes = int(m.group('bytes'))

    m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
    profiled_flops = int(m.group('flops'))

    # check if the problem size matches
    assert profiled_bytes == self.bytes(problem_size, alpha, beta)
    assert profiled_flops == self.flops(problem_size)

    return runtime
def run(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial,
        alpha=1.0, beta=0.0):
    """Launch the conv2d operation once and optionally verify / profile it.

    Args:
        problem_size: conv2d problem description.
        split_k_mode: when ``Parallel``, a separate reduction kernel is run
            after the convolution.
        alpha: epilogue scalar.
        beta: epilogue scalar.

    Returns:
        The measured runtime when ``self.profiling`` is set, otherwise the
        verification result (True when verification is disabled).

    Raises:
        AssertionError: if pool memory is still allocated before or after
            the run.
    """
    assert get_allocated_size() == 0, "%d byte of pool memory is not released in previous run" % get_allocated_size()

    parallel_split_k = split_k_mode == cutlass.conv.SplitKMode.Parallel

    #
    # Initialize input and output tensors
    #
    tensor_A_size = cutlass.conv.implicit_gemm_tensor_a_size(self.conv_kind, problem_size)
    tensor_B_size = cutlass.conv.implicit_gemm_tensor_b_size(self.conv_kind, problem_size)
    tensor_C_size = cutlass.conv.implicit_gemm_tensor_c_size(self.conv_kind, problem_size)

    # Re-seed so every run sees the same operands.
    np.random.seed(self.seed)

    tensor_A = self.uniform_init(size=(tensor_A_size,), dtype=self.dtype_A)
    tensor_B = self.uniform_init(size=(tensor_B_size,), dtype=self.dtype_B)
    tensor_C = self.uniform_init(size=(tensor_C_size,), dtype=self.dtype_C)
    tensor_D = np.zeros(shape=(tensor_C_size,), dtype=self.dtype_D)

    #
    # Launch kernel
    #
    arguments = Conv2dArguments(
        operation=self.operation, problem_size=problem_size, A=tensor_A,
        B=tensor_B, C=tensor_C, D=tensor_D,
        output_op=LinearCombinationFunctorArguments(alpha, beta),
        split_k_slices=problem_size.split_k_slices,
        split_k_mode=split_k_mode
    )

    if parallel_split_k:
        # The reduction reads the convolution's D pointer as its workspace
        # and writes the final result into tensor_D.
        implicit_gemm_size = cutlass.conv.implicit_gemm_problem_size(self.operation.conv_kind, arguments.problem_size)
        reduction_arguments = ReductionArguments(
            self.reduction_operation,
            problem_size=[implicit_gemm_size.m(), implicit_gemm_size.n()], partitions=problem_size.split_k_slices,
            workspace=arguments.ptr_D,
            destination=tensor_D,
            source=tensor_C,
            output_op=LinearCombinationFunctorArguments(alpha, beta)
        )

    self.operation.run(arguments)
    if parallel_split_k:
        self.reduction_operation.run(reduction_arguments)

    passed = True
    if self.verification:
        if parallel_split_k:
            reduction_arguments.sync()
        else:
            arguments.sync()

        tensor_D_ref = self.host_reference(problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)
        passed = self.equal(tensor_D, tensor_D_ref, problem_size)

        # Report the failing problem; the verdict itself is returned below.
        if not passed:
            self.print_problem_size(problem_size, split_k_mode)

    if self.profiling:
        sleep(self.sleep_time)
        for _ in range(self.warmup_iterations):
            self.operation.run(arguments)
            if parallel_split_k:
                self.reduction_operation.run(reduction_arguments)

        self.timer.start()
        # The timed loop must run self.iterations launches, because the
        # duration below is computed for exactly that many iterations.
        for _ in range(self.iterations):
            self.operation.run(arguments)
            if parallel_split_k:
                self.reduction_operation.run(reduction_arguments)
        self.timer.stop_and_wait()
        runtime = self.timer.duration(self.iterations)

    # free memory
    del arguments
    if parallel_split_k:
        del reduction_arguments

    assert get_allocated_size() == 0, "%d byte of pool memory is not released after current run" % get_allocated_size()

    if self.profiling:
        return runtime
    return passed
########################################################################################################
# TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
# TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
# Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
# (conv_blacklist_sizes)
############################################################################################################
def test_all_conv2d(operation: Conv2dOperation, conv_test_sizes=None, interleaved=False):  # TODO: conv_test_sizes and conv_blacklist_sizes
    """Run ``operation`` over the default conv2d problem sizes and a split-k sweep.

    Args:
        operation: the conv2d operation under test.
        conv_test_sizes: optional extra problem sizes appended to the default
            ones.
        interleaved: when True, only problems whose K and C are multiples of
            32 are run and the split-k sweep is skipped.

    Returns:
        True only if every executed problem passed; a failure in any run is
        remembered instead of being overwritten by later runs.
    """
    all_passed = True

    #
    # Testbed object
    #
    testbed = Conv2dLauncher(operation, interleaved=interleaved)

    #
    # Get conv problem sizes to run conv operator
    #
    conv_problems = cutlass.test.conv.TestbedConv2dProblemSizes(64)

    # Vector of conv2d problem sizes to avoid duplicate runs
    conv_tested_sizes = []

    # TODO: include resnet 50 sizes, user specified sizes, and rigorous sizes

    # Flatten 2D problem_vectors into a 1D problem sizes
    problem_sizes = list(conv_problems.conv2d_default_sizes)
    if conv_test_sizes:
        problem_sizes += list(conv_test_sizes)

    is_dgrad = operation.conv_kind == cutlass.conv.Operator.dgrad

    # Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slices=1, alpha=1.0, beta=0.0)
    for conv_problem in problem_sizes:
        # TODO: skip blacklist problem sizes
        if conv_problem in conv_tested_sizes:
            continue

        # skip channel dimension % 32 != 0 for interleaved case
        if interleaved:
            if conv_problem.K % 32 != 0 or conv_problem.C % 32 != 0:
                continue

        #
        # Procedurally disable certain cases
        #
        unit_stride = (conv_problem.stride_h == 1) and (conv_problem.stride_w == 1)

        # CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
        if is_dgrad and operation.stride_support == StrideSupport.Unity:
            if not unit_stride:
                continue

        if not interleaved:
            # Fixed channels algorithm requires channel count to match access size
            if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.fixed_channels:
                if conv_problem.C != operation.A.alignment:
                    continue

            # Few channels algorithm requires channel count to match access size
            if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.few_channels:
                if conv_problem.C % operation.A.alignment:
                    continue

        # CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
        # Although strided dgrad works for all stride combinations, we are only going
        # to run strided dgrad for non-unity strides
        if is_dgrad and operation.stride_support == StrideSupport.Strided:
            if unit_stride:
                continue

        #
        # Test
        #
        # push back tested problem size to avoid re-running duplicates
        conv_tested_sizes.append(conv_problem)

        # Keep running the remaining sizes, but remember any failure.
        if not testbed.run(conv_problem):
            all_passed = False

    # TODO: If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts

    if interleaved:
        return all_passed

    #
    # filter the cases for split K
    #
    # Small-channels convolution can't run here.
    if operation.iterator_algorithm in [cutlass.conv.IteratorAlgorithm.fixed_channels, cutlass.conv.IteratorAlgorithm.few_channels]:
        return all_passed

    # CUTLASS DGRAD's *stride* specialization does not support split-k mode
    if is_dgrad and operation.stride_support == StrideSupport.Strided:
        conv_problem = cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 56, 56, 8),
            cutlass.Tensor4DCoord(8, 1, 1, 8),
            cutlass.Tensor4DCoord(0, 0, 0, 0),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )
        if not testbed.run(conv_problem):
            all_passed = False
        return all_passed

    # Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero beta for
    # a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
    # which are absolutely necessary to catch functional bugs. The below code does provide option to sweep
    # alpha and beta for local testing, but only runs one value for alpha and beta.
    conv2d_split_k_test_size = cutlass.conv.Conv2dProblemSize(
        cutlass.Tensor4DCoord(1, 17, 11, 288),
        cutlass.Tensor4DCoord(160, 3, 3, 288),
        cutlass.Tensor4DCoord(1, 1, 1, 1),
        cutlass.MatrixCoord(1, 1),
        cutlass.MatrixCoord(1, 1),
        cutlass.conv.Mode.cross_correlation,
        1, 1
    )

    split_k_modes = [cutlass.conv.SplitKMode.Parallel, cutlass.conv.SplitKMode.Serial]
    split_k_slices = [1, 2, 3, 4, 201]
    problem_alpha = [2.0,]
    problem_beta = [2.0,]

    for split_k_mode in split_k_modes:
        for split_k_slice in split_k_slices:
            for alpha in problem_alpha:
                for beta in problem_beta:
                    if not testbed.run(conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
                                       split_k_mode,
                                       alpha, beta):
                        all_passed = False

    return all_passed

View File

@ -0,0 +1,235 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass.test.gemm_testbed import getTensorRef, getTensorView, transpose
from pycutlass import *
import numpy as np
import cutlass
from bfloat16 import bfloat16
class TestbedGrouped:
    """Testbed that launches a grouped GEMM and checks each problem on the host."""

    def __init__(self, operation: GemmOperationGrouped, seed: int = 2080) -> None:
        """Compile ``operation`` and derive operand dtypes and value ranges.

        Args:
            operation: the grouped GEMM operation under test.
            seed: seed passed to ``np.random.seed`` at the start of each run.
        """
        pycutlass.compiler.add_module([operation])

        self.seed = seed
        self.operation = operation

        element_size = DataTypeSize[operation.A.element]

        self.dtype_A = self.numpy_type(operation.A.element)
        self.dtype_B = self.numpy_type(operation.B.element)
        self.dtype_C = self.numpy_type(operation.C.element)
        # D uses the element type of C.
        self.dtype_D = self.numpy_type(operation.C.element)

        # Range of the random operand values, chosen from the size of A's
        # element type as reported by DataTypeSize.
        if element_size == 1:
            self.scope_max = 1
            self.scope_min = 0
        elif element_size <= 8:
            self.scope_max = 1
            self.scope_min = -1
        elif element_size == 16:
            self.scope_max = 4
            self.scope_min = -4
        else:
            self.scope_max = 8
            self.scope_min = -8

        #: compute type
        self.compute_type = operation.element_epilogue
        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator

    @staticmethod
    def numpy_type(type):
        """Map a cutlass element type to the matching numpy scalar type.

        Raises:
            ValueError: if ``type`` is not one of the handled element types.
        """
        if type == cutlass.float64:
            return np.float64
        elif type == cutlass.float32:
            return np.float32
        elif type == cutlass.float16:
            return np.float16
        elif type == cutlass.bfloat16:
            return bfloat16
        elif type == cutlass.int32:
            return np.int32
        elif type == cutlass.int8:
            return np.int8
        else:
            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])

    def uniform_init(self, size, dtype):
        """Return a random array of ``size`` elements drawn from the scope range.

        Floating-point dtypes are rounded up with ``np.ceil`` after the cast;
        other dtypes are drawn from a range widened by one on both ends and
        then cast.
        """
        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
            return np.ceil(
                np.random.uniform(
                    low=self.scope_min - 0.5, high=self.scope_max - 0.5,
                    size=size).astype(dtype)
            )
        else:
            return np.random.uniform(
                low=self.scope_min - 1, high=self.scope_max + 1,
                size=size).astype(dtype)

    def print_problem_size(self, p):
        """Print the m, n, k extents of problem ``p``."""
        problem_size = "problem: %d, %d, %d\n" % (p.m(), p.n(), p.k())
        print(problem_size)

    def run(self, problem_count: int, alpha: float = 1.0, beta: float = 0.0) -> bool:
        """Run a grouped GEMM over ``problem_count`` problems and verify them.

        The first problem has a fixed size; the remaining sizes are drawn
        from ``np.random`` after seeding with ``self.seed``.

        Args:
            problem_count: number of GEMM problems in the group.
            alpha: epilogue scalar.
            beta: epilogue scalar.

        Returns:
            True only if every problem in the group matches the host
            reference (True for an empty group).

        Raises:
            AssertionError: if pool memory is still allocated before or after
                the run.
        """
        assert get_allocated_size(
        ) == 0, "%d byte of pool memory is not released in previous run" % get_allocated_size()

        # initialize
        np.random.seed(self.seed)

        # generate the problem sizes
        problem_sizes = []
        tensor_As = []
        tensor_Bs = []
        tensor_Cs = []
        tensor_Ds = []
        tensor_D_refs = []

        for i in range(problem_count):
            if self.dtype_A == np.int8:
                if i == 0:
                    problem_size = cutlass.gemm.GemmCoord(48, 16, 32)
                else:
                    problem_size = cutlass.gemm.GemmCoord(
                        16 * np.random.randint(0, 64) + 48,
                        16 * np.random.randint(0, 64) + 48,
                        16 * np.random.randint(0, 64) + 48
                    )
            else:
                if i == 0:
                    problem_size = cutlass.gemm.GemmCoord(48, 16, 8)
                else:
                    problem_size = cutlass.gemm.GemmCoord(
                        8 * np.random.randint(0, 64) + 24,
                        8 * np.random.randint(0, 64) + 24,
                        8 * np.random.randint(0, 64) + 24
                    )

            m, n, k = problem_size.m(), problem_size.n(), problem_size.k()

            # Operands are drawn in A, B, C order so the random stream stays
            # reproducible for a given seed.
            tensor_As.append(self.uniform_init(size=(m * k,), dtype=self.dtype_A))
            tensor_Bs.append(self.uniform_init(size=(n * k,), dtype=self.dtype_B))
            tensor_Cs.append(self.uniform_init(size=(m * n,), dtype=self.dtype_C))

            # Device output starts at zero and the reference at one, so an
            # untouched output cannot compare equal by accident.
            tensor_Ds.append(np.zeros(shape=(m * n,), dtype=self.dtype_D))
            tensor_D_refs.append(np.ones(shape=(m * n,), dtype=self.dtype_D))

            problem_sizes.append(problem_size)

        arguments = GemmGroupedArguments(
            operation=self.operation, problem_sizes=problem_sizes,
            A=tensor_As, B=tensor_Bs, C=tensor_Cs, D=tensor_Ds,
            output_op=LinearCombinationFunctorArguments(alpha, beta)
        )

        self.operation.run(arguments)

        arguments.sync()

        #
        # Reference check - TODO: support caching results
        #
        alpha = self.compute_type(alpha).value()
        beta = self.compute_type(beta).value()
        init_acc = self.accumulator_type(0).value()

        # Aggregate verdict over the whole group; a later passing problem
        # must not hide an earlier failure.
        all_passed = True

        for idx, problem_size in enumerate(problem_sizes):
            if self.operation.switched:
                tensor_ref_A = getTensorRef(
                    tensor_As[idx], problem_size, "a", transpose(self.operation.B.layout))
                tensor_ref_B = getTensorRef(
                    tensor_Bs[idx], problem_size, "b", transpose(self.operation.A.layout))
                tensor_ref_C = getTensorRef(
                    tensor_Cs[idx], problem_size, "c", transpose(self.operation.C.layout))
                tensor_ref_D_ref = getTensorRef(
                    tensor_D_refs[idx], problem_size, "d", transpose(self.operation.C.layout))
            else:
                tensor_ref_A = getTensorRef(
                    tensor_As[idx], problem_size, "a", self.operation.A.layout)
                tensor_ref_B = getTensorRef(
                    tensor_Bs[idx], problem_size, "b", self.operation.B.layout)
                tensor_ref_C = getTensorRef(
                    tensor_Cs[idx], problem_size, "c", self.operation.C.layout)
                tensor_ref_D_ref = getTensorRef(
                    tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)

            tensor_view_D_ref = getTensorView(
                tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)

            cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
                                        tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)

            tensor_view_D = getTensorView(
                tensor_Ds[idx], problem_size, "d", self.operation.C.layout)

            passed = cutlass.test.gemm.host.equals(
                tensor_view_D, tensor_view_D_ref)

            if not passed:
                self.print_problem_size(problem_size)
                all_passed = False

        del arguments

        assert get_allocated_size(
        ) == 0, "%d byte of pool memory is not released after current run" % get_allocated_size()

        return all_passed

View File

@ -0,0 +1,557 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from time import sleep
import pycutlass
from pycutlass import *
import cutlass
from cuda import cudart
from cuda import cuda
from bfloat16 import bfloat16
from .profiler import GpuTimer
import subprocess
def transpose(layout):
    """Return the layout with row- and column-major order swapped.

    The plain and the 32-interleaved layouts are handled; for any other
    layout the function returns None.
    """
    swaps = (
        (cutlass.RowMajor, cutlass.ColumnMajor),
        (cutlass.ColumnMajor, cutlass.RowMajor),
        (cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32),
        (cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32),
    )
    for source, target in swaps:
        if layout == source:
            return target
    return None
def getTensorRef(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: cutlass.layout):
    """Wrap the buffer of ``tensor`` in the matching cutlass TensorRef.

    Args:
        tensor: numpy array whose data pointer is wrapped.
        problem_size: GEMM extents; the operand selects mk / kn / mn.
        operand: one of "a", "b", "c" or "d".
        layout: one of the four cutlass layouts handled below.

    Returns:
        An instance of ``cutlass.TensorRef<dtype><layout>`` built from the
        data pointer and a packed layout for the operand's extent.

    Raises:
        ValueError: for an unknown operand, layout, or element dtype.
    """
    ptr = tensor.__array_interface__['data'][0]

    if operand == "a":
        tensor_coord = problem_size.mk()
    elif operand == "b":
        tensor_coord = problem_size.kn()
    elif operand in ["c", "d"]:
        tensor_coord = problem_size.mn()
    else:
        raise ValueError("unknonw operand: " + operand)

    # Resolve the layout to its packed instance and the class-name suffix.
    known_layouts = (
        (cutlass.RowMajor, "RowMajor"),
        (cutlass.ColumnMajor, "ColumnMajor"),
        (cutlass.ColumnMajorInterleaved32, "ColumnMajorInterleaved32"),
        (cutlass.RowMajorInterleaved32, "RowMajorInterleaved32"),
    )
    for layout_type, layout_tag in known_layouts:
        if layout == layout_type:
            layout = layout_type.packed(tensor_coord)
            break
    else:
        raise ValueError("unsupported layout")

    # Resolve the numpy dtype to the element part of the class name.
    known_dtypes = (
        (np.float32, "F32"),
        (np.float64, "F64"),
        (np.float16, "F16"),
        (bfloat16, "BF16"),
        (np.int8, "S8"),
        (np.int32, "S32"),
    )
    for np_type, element_tag in known_dtypes:
        if tensor.dtype == np_type:
            ref_name = "TensorRef" + element_tag + layout_tag
            break
    else:
        # Format the numpy dtype directly: ShortDataTypeNames is indexed with
        # cutlass element types elsewhere, so looking up a numpy dtype there
        # would fail before this ValueError could be raised.
        raise ValueError("unsupported datatype %s" % tensor.dtype)

    return getattr(cutlass, ref_name)(ptr, layout)
def getTensorView(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: str):
    """Wrap ``tensor`` in the matching cutlass TensorView.

    A TensorRef is built first via ``getTensorRef``; the view class is then
    selected from the array dtype and the layout, and constructed from that
    reference and the operand's extent (mk, kn or mn of ``problem_size``).

    Raises:
        ValueError: for an unknown operand, layout, or element dtype.
    """
    tensor_ref = getTensorRef(tensor, problem_size, operand, layout)

    # Extent of the selected operand.
    if operand == "a":
        extent = problem_size.mk()
    elif operand == "b":
        extent = problem_size.kn()
    elif operand in ["c", "d"]:
        extent = problem_size.mn()
    else:
        raise ValueError("unknonw operand: " + operand)

    # Layout part of the view class name.
    layout_names = (
        (cutlass.RowMajor, "RowMajor"),
        (cutlass.ColumnMajor, "ColumnMajor"),
        (cutlass.ColumnMajorInterleaved32, "ColumnMajorInterleaved32"),
        (cutlass.RowMajorInterleaved32, "RowMajorInterleaved32"),
    )
    for candidate, name in layout_names:
        if layout == candidate:
            layout_tag = name
            break
    else:
        raise ValueError("unsupported layout")

    # Element part of the view class name.
    view_prefixes = (
        (np.float32, "TensorViewF32"),
        (np.float64, "TensorViewF64"),
        (np.float16, "TensorViewF16"),
        (bfloat16, "TensorViewBF16"),
        (np.int32, "TensorViewS32"),
        (np.int8, "TensorViewS8"),
    )
    for candidate, prefix in view_prefixes:
        if tensor.dtype == candidate:
            ref_name = prefix + layout_tag
            break
    else:
        raise ValueError("unsupported datatype")

    view_class = getattr(cutlass, ref_name)
    return view_class(tensor_ref, extent)
class GemmUniversalLauncher:
    """Compile, launch, verify and optionally profile a universal GEMM operation."""

    def __init__(self, operation: 'GemmOperationUniversal', seed: int = 2080, interleaved=False,
                 verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
        """Build the reduction kernel, compile both kernels and set up state.

        Args:
            operation: the GEMM operation under test.
            seed: seed passed to ``np.random.seed`` at the start of each run.
            interleaved: stored as ``self.interleaved``.
            verification: compare the device result with the host reference.
            profiling: time the kernel and return the runtime from ``run``.
            warmup_iterations: untimed launches before profiling.
            iterations: timed launches during profiling.
            **kwargs: ``sleep`` sets the pause (passed to ``sleep``) taken
                before profiling starts; defaults to 0.
        """
        # create the reduction kernel
        self.reduction_operation: ReductionOperation = ReductionOperation(
            shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
            C=operation.C,
            element_accumulator=operation.tile_description.math_instruction.element_accumulator,
            element_compute=operation.element_epilogue,
            count=operation.C.alignment
        )

        self.math_operation = operation.tile_description.math_instruction.math_operation

        #: verify the output result
        self.verification = verification
        #: profile the kernel's runtime
        self.profiling = profiling

        self.timer = GpuTimer()

        self.warmup_iterations = warmup_iterations
        self.iterations = iterations

        self.sleep_time = kwargs.get("sleep", 0)

        #
        # Compile the operator
        #
        pycutlass.compiler.add_module([operation, self.reduction_operation])

        self.operation = operation

        self.dtype_A = GemmUniversalLauncher.numpy_type(operation.A.element)
        self.dtype_B = GemmUniversalLauncher.numpy_type(operation.B.element)
        self.dtype_C = GemmUniversalLauncher.numpy_type(operation.C.element)
        # D uses the element type of C.
        self.dtype_D = GemmUniversalLauncher.numpy_type(operation.C.element)

        # Range of the random operand values, chosen from the size of A's
        # element type as reported by DataTypeSize.
        element_size = DataTypeSize[operation.A.element]

        if element_size == 1:
            self.scope_max = 1
            self.scope_min = 0
        elif element_size <= 8:
            self.scope_max = 1
            self.scope_min = -1
        elif element_size == 16:
            self.scope_max = 4
            self.scope_min = -4
        else:
            self.scope_max = 8
            self.scope_min = -8

        #: seed
        self.seed: int = seed

        #: whether the layout is interleaved
        self.interleaved = interleaved

        #: compute type
        self.compute_type = operation.element_epilogue
        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator

    def print_problem_size(self, p, mode, batch_count):
        """Print the extents, batch count and mode of a problem."""
        if mode == cutlass.gemm.Mode.Gemm:
            mode = "Gemm"
        elif mode == cutlass.gemm.Mode.GemmSplitKParallel:
            mode = "GemmSplitKParalel"
        problem_size = "problem: %d, %d, %d\n batch_count: %d\n mode: %s" % (
            p.m(), p.n(), p.k(), batch_count, mode)
        print(problem_size)

    @staticmethod
    def numpy_type(type):
        """Map a cutlass element type to the matching numpy scalar type.

        Raises:
            ValueError: if ``type`` is not one of the handled element types.
        """
        if type == cutlass.float64:
            return np.float64
        elif type == cutlass.float32:
            return np.float32
        elif type == cutlass.float16:
            return np.float16
        elif type == cutlass.bfloat16:
            return bfloat16
        elif type == cutlass.int32:
            return np.int32
        elif type == cutlass.int8:
            return np.int8
        else:
            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])

    def uniform_init(self, size, dtype):
        """Return a random array of ``size`` elements drawn from the scope range.

        Floating-point dtypes are rounded up with ``np.ceil`` after the cast;
        other dtypes are drawn from a range widened by one on both ends and
        then cast.
        """
        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
            return np.ceil(
                np.random.uniform(
                    low=self.scope_min - 0.5, high=self.scope_max - 0.5,
                    size=size).astype(dtype)
            )
        else:
            return np.random.uniform(
                low=self.scope_min - 1, high=self.scope_max + 1,
                size=size).astype(dtype)

    def reorder_tensor_B(self, tensor_B, problem_size):
        """Return a copy of ``tensor_B`` rearranged by the host ``reorder_column``."""
        reordered_tensor_B = np.empty_like(tensor_B)
        tensor_ref_B = getTensorRef(
            tensor_B, problem_size, "b", self.operation.B.layout)
        reordered_tensor_ref_B = getTensorRef(
            reordered_tensor_B, problem_size, "b", self.operation.B.layout)
        cutlass.gemm.host.reorder_column(
            tensor_ref_B, reordered_tensor_ref_B, problem_size)
        return reordered_tensor_B

    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
        """Compute the reference output on the host and return it.

        The saturating host GEMM is used when the math operation is
        ``multiply_add_saturate``; otherwise the plain host GEMM is used.
        """
        # TODO
        tensor_D_ref = np.ones_like(tensor_C)

        # Scalars go through the numpy compute type first, then through the
        # cutlass compute / accumulator types.
        alpha = self.numpy_type(self.compute_type)(alpha)
        beta = self.numpy_type(self.compute_type)(beta)
        init_acc = 0

        alpha = self.compute_type(alpha).value()
        beta = self.compute_type(beta).value()
        init_acc = self.accumulator_type(init_acc).value()

        if self.operation.switched:
            tensor_ref_A = getTensorRef(
                tensor_A, problem_size, "a", transpose(self.operation.B.layout))
            tensor_ref_B = getTensorRef(
                tensor_B, problem_size, "b", transpose(self.operation.A.layout))
            tensor_ref_C = getTensorRef(
                tensor_C, problem_size, "c", transpose(self.operation.C.layout))
            tensor_ref_D_ref = getTensorRef(
                tensor_D_ref, problem_size, "d", transpose(self.operation.C.layout))
        else:
            tensor_ref_A = getTensorRef(
                tensor_A, problem_size, "a", self.operation.A.layout)
            tensor_ref_B = getTensorRef(
                tensor_B, problem_size, "b", self.operation.B.layout)
            tensor_ref_C = getTensorRef(
                tensor_C, problem_size, "c", self.operation.C.layout)
            tensor_ref_D_ref = getTensorRef(
                tensor_D_ref, problem_size, "d", self.operation.C.layout)

        if self.math_operation in [MathOperation.multiply_add_saturate]:
            cutlass.test.gemm.host.gemm_saturate(
                problem_size, alpha, tensor_ref_A, tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
        else:
            cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
                                        tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)

        return tensor_D_ref

    def equal(self, tensor_D, tensor_D_ref, problem_size):
        """Return whether the device output equals the host reference."""
        tensor_view_D = getTensorView(
            tensor_D, problem_size, "d", self.operation.C.layout)
        tensor_view_D_ref = getTensorView(
            tensor_D_ref, problem_size, "d", self.operation.C.layout)

        return cutlass.test.gemm.host.equals(tensor_view_D, tensor_view_D_ref)

    def bytes(self, problem_size, batch_count=1, alpha=1.0, beta=0.0):
        """Return the byte count of A, B and C for one problem, times ``batch_count``.

        C is counted a second time when ``beta`` is non-zero. ``alpha`` is
        accepted but not used.
        """
        m = problem_size.m()
        n = problem_size.n()
        k = problem_size.k()

        total = \
            (DataTypeSize[self.operation.A.element] * m // 8) * k + \
            (DataTypeSize[self.operation.B.element] * n // 8) * k + \
            (DataTypeSize[self.operation.C.element] * m // 8) * n

        if beta != 0:
            total += (DataTypeSize[self.operation.C.element] * m // 8) * n

        total *= batch_count

        return total

    def flops(self, problem_size, batch_count=1):
        """Return ``(m*n*k + m*n) * 2 * batch_count`` for the problem."""
        m = problem_size.m()
        n = problem_size.n()
        k = problem_size.k()

        flops_ = (m * n * k + m * n) * 2 * batch_count

        # TODO: complex
        return flops_

    def run_cutlass_profiler(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):
        """Profile this operation with the external ``cutlass_profiler`` binary.

        Args:
            mode: accepted for signature parity with ``run``; not used here.
            problem_size: GEMM extents forwarded as ``--m/--n/--k``.
            batch_count: prepared in ``values`` but not forwarded to the
                profiler command.
            alpha: epilogue scalar forwarded as ``--alpha``.
            beta: epilogue scalar forwarded as ``--beta``.

        Returns:
            The number printed after "Runtime:" in the profiler output.

        Raises:
            AssertionError: if ``CUTLASS_PATH`` is not set, or if the byte /
                FLOP counts reported by the profiler differ from
                ``self.bytes`` / ``self.flops``.
        """
        cutlass_path = os.getenv('CUTLASS_PATH')
        assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."

        values = {
            "profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
            "kernel_name": self.operation.procedural_name(),
            "verification_providers": "device",
            "provider": "cutlass",
            "m": str(problem_size.m()),
            "n": str(problem_size.n()),
            "k": str(problem_size.k()),
            'split_k_slices': str(batch_count),
            'alpha': str(alpha),
            'beta': str(beta),
            'warmup': str(self.warmup_iterations),
            'profile': str(self.iterations)
        }

        # alpha and beta are forwarded so the profiler's byte count is
        # computed for the same epilogue scalars as the expectation below.
        cmd_template = \
            "${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
            " --providers=${provider} --m=${m} --n=${n} --k=${k} --alpha=${alpha} --beta=${beta}"

        cmd = SubstituteTemplate(cmd_template, values)
        result = subprocess.getoutput(cmd)

        # A missing field makes re.search return None and fails on .group().
        m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
        runtime = float(m.group('runtime'))

        m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
        profiled_bytes = int(m.group('bytes'))

        m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
        profiled_flops = int(m.group('flops'))

        # check if the problem size matches
        # alpha/beta are passed by keyword: the second positional parameter
        # of self.bytes is batch_count, not alpha.
        # NOTE(review): batch_count is not forwarded to the profiler, so the
        # expectation is computed for a single batch - confirm intent.
        assert profiled_bytes == self.bytes(problem_size, alpha=alpha, beta=beta)
        assert profiled_flops == self.flops(problem_size)

        return runtime

    def run(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):
        """Launch the GEMM once and optionally verify / profile it.

        Args:
            mode: GEMM mode; ``GemmSplitKParallel`` adds a reduction kernel.
            problem_size: GEMM extents.
            batch_count: passed to the kernel as ``split_k_slices`` and to
                the reduction as ``partitions``.
            alpha: epilogue scalar.
            beta: epilogue scalar.

        Returns:
            The measured runtime when ``self.profiling`` is set, otherwise
            the verification result (True when verification is disabled).

        Raises:
            AssertionError: if pool memory is still allocated before or after
                the run.
        """
        assert get_allocated_size(
        ) == 0, "%d byte of pool memory is not released in previous run" % get_allocated_size()

        split_k_parallel = mode == cutlass.gemm.Mode.GemmSplitKParallel

        # Re-seed so every run sees the same operands.
        np.random.seed(self.seed)

        tensor_A = self.uniform_init(
            size=(problem_size.m() * problem_size.k(),), dtype=self.dtype_A)
        tensor_B = self.uniform_init(
            size=(problem_size.n() * problem_size.k(),), dtype=self.dtype_B)
        tensor_C = self.uniform_init(
            size=(problem_size.m() * problem_size.n(),), dtype=self.dtype_C)
        tensor_D = np.zeros(
            shape=(problem_size.m() * problem_size.n(),), dtype=self.dtype_D)

        #
        # Launch kernel
        #
        arguments = GemmArguments(
            operation=self.operation, problem_size=problem_size,
            A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
            output_op=LinearCombinationFunctorArguments(alpha, beta),
            gemm_mode=mode, split_k_slices=batch_count
        )

        if split_k_parallel:
            # The reduction reads the GEMM's D pointer as its workspace and
            # writes the final result into tensor_D.
            reduction_arguments = ReductionArguments(
                self.reduction_operation, problem_size=[
                    problem_size.m(), problem_size.n()],
                partitions=batch_count,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=LinearCombinationFunctorArguments(alpha, beta)
            )

        self.operation.run(arguments)

        if split_k_parallel:
            self.reduction_operation.run(reduction_arguments)

        passed = True

        if self.verification:
            if split_k_parallel:
                reduction_arguments.sync()
            else:
                arguments.sync()
            tensor_D_ref = self.host_reference(
                problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)
            passed = self.equal(tensor_D, tensor_D_ref, problem_size)

            # Report the failing problem; the verdict itself is returned below.
            if not passed:
                self.print_problem_size(problem_size, mode, batch_count)

        if self.profiling:
            sleep(self.sleep_time)
            for _ in range(self.warmup_iterations):
                self.operation.run(arguments)
                if split_k_parallel:
                    self.reduction_operation.run(reduction_arguments)

            self.timer.start()
            for _ in range(self.iterations):
                self.operation.run(arguments)
                if split_k_parallel:
                    self.reduction_operation.run(reduction_arguments)
            self.timer.stop_and_wait()

            runtime = self.timer.duration(self.iterations)

        # free memory and clear buffers
        del arguments
        if split_k_parallel:
            del reduction_arguments

        assert get_allocated_size(
        ) == 0, "%d byte of pool memory is not released after current run" % get_allocated_size()

        if self.profiling:
            return runtime
        return passed
def test_all_gemm(operation: 'GemmOperationUniversal', testcase="universal"):
    """
    Launch a GEMM operation over a sweep of problem sizes.

    The sweep (modes, M/N/K extents, batch counts, alpha/beta) is derived from
    the operation's operand element types, layouts and tile description, and
    depends on ``testcase``.

    Args:
        operation: the GEMM operation under test.
        testcase: ``"interleaved"``, ``"multistage"``; any other value selects
            the default (universal) sweep.

    Returns:
        ``False`` as soon as one launch reports a falsy result; otherwise the
        value returned by the last launch (``True`` if every configuration was
        skipped).

    Raises:
        ValueError: for the ``"interleaved"`` testcase when operand A does not
            use a 32-interleaved layout.
        RuntimeError: if the device synchronization after a launch fails.
    """
    from itertools import product

    passed = True

    tile = operation.tile_description
    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element])
    is_simt = tile.math_instruction.opcode_class == cutlass.OpClass.Simt

    if is_simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size

    # int8_t gemm alignment constraints
    a_is_int8 = operation.A.element == cutlass.int8
    b_is_int8 = operation.B.element == cutlass.int8

    if is_simt and a_is_int8 and operation.A.layout == cutlass.ColumnMajor:
        alignment_m = 4
    else:
        alignment_m = alignment

    # The N extent belongs to operand B, so the rule must look at B's layout
    # (the original tested A's layout here, mismatching the M rule above).
    if is_simt and b_is_int8 and operation.B.layout == cutlass.RowMajor:
        alignment_n = 4
    else:
        alignment_n = alignment

    if is_simt and a_is_int8 and b_is_int8 \
            and (operation.A.layout == cutlass.RowMajor
                 or operation.B.layout == cutlass.ColumnMajor):
        alignment_k = 4
    else:
        alignment_k = alignment

    threadblock_k = tile.threadblock_shape[2]

    if testcase == "interleaved":
        interleaved_layouts = [
            cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]
        if operation.A.layout in interleaved_layouts:
            interleavedk = 32
        else:
            raise ValueError("unknonw layout")

        modes = [cutlass.gemm.Mode.Gemm, ]
        problem_size_m = [interleavedk, 512 + interleavedk]
        problem_size_n = [interleavedk, 512 + interleavedk]
        problem_size_k = [
            interleavedk, threadblock_k * tile.stages + interleavedk]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1, ]
    elif testcase == "multistage":
        modes = [cutlass.gemm.Mode.Gemm, ]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        problem_size_k = [
            threadblock_k,
            threadblock_k * tile.stages
            + tile.math_instruction.instruction_shape[2]]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1, ]
    else:  # universal
        modes = [cutlass.gemm.Mode.Gemm, cutlass.gemm.Mode.GemmSplitKParallel]
        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        problem_size_k = [
            alignment_k,
            threadblock_k * tile.stages - alignment_k,
            threadblock_k * tile.stages * 3 - alignment_k]
        batch_counts = [1, 2, 3, 5, 7]
        problem_alpha = [1.0]
        problem_beta = [2.0]

    testbed = GemmUniversalLauncher(
        operation, interleaved=(testcase == "interleaved"))

    # product() iterates with the right-most sequence varying fastest, which
    # is the same order as the equivalent nested for-loops.
    sweep = product(
        modes, problem_size_m, problem_size_n, problem_size_k,
        batch_counts, problem_alpha, problem_beta)

    for mode, m, n, k, batch_count, alpha, beta in sweep:
        # skip very small K problems
        if testcase == "universal" and k // batch_count < 2 * threadblock_k:
            continue

        problem_size = cutlass.gemm.GemmCoord(m, n, k)

        passed = testbed.run(mode, problem_size, batch_count, alpha, beta)

        err, = cudart.cudaDeviceSynchronize()
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("CUDA Error %s" % str(err))

        if not passed:
            return False

    return passed

View File

@ -0,0 +1,70 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from cuda import cuda
from cuda import cudart
class GpuTimer:
    """
    Times GPU work with a pair of CUDA events.

    ``events[0]`` is recorded by :meth:`start` and ``events[1]`` by
    :meth:`stop`; :meth:`duration` reports the elapsed time between them.
    """

    def __init__(self) -> None:
        # events[0] marks the start of the timed region, events[1] its end.
        self.events = [self._create_event(), self._create_event()]

    @staticmethod
    def _check(err, success):
        """Raise RuntimeError unless ``err`` equals the ``success`` code."""
        if err != success:
            raise RuntimeError("CUDA Error %s" % str(err))

    @classmethod
    def _create_event(cls):
        """Create one default-flag CUDA event, failing loudly on error."""
        err, event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)
        cls._check(err, cuda.CUresult.CUDA_SUCCESS)
        return event

    def start(self, stream=cuda.CUstream(0)):
        """Record the start event on ``stream``."""
        err, = cuda.cuEventRecord(self.events[0], stream)
        self._check(err, cuda.CUresult.CUDA_SUCCESS)

    def stop(self, stream=cuda.CUstream(0)):
        """Record the stop event on ``stream``."""
        err, = cuda.cuEventRecord(self.events[1], stream)
        self._check(err, cuda.CUresult.CUDA_SUCCESS)

    def stop_and_wait(self, stream=cuda.CUstream(0)):
        """
        Record the stop event on ``stream`` and block until the work is done.

        Synchronizes the stream when ``stream`` is truthy, otherwise the
        whole device.
        """
        self.stop(stream)
        # NOTE(review): this relies on the truthiness of the CUstream object;
        # confirm that CUstream(0) actually evaluates as falsy.
        if stream:
            err, = cuda.cuStreamSynchronize(stream)
            self._check(err, cuda.CUresult.CUDA_SUCCESS)
        else:
            # cudart calls report a runtime-API status, so compare against the
            # runtime success code rather than the driver one.
            err, = cudart.cudaDeviceSynchronize()
            self._check(err, cudart.cudaError_t.cudaSuccess)

    def duration(self, iterations=1):
        """
        Return the elapsed time between the start and stop events divided by
        ``iterations``.

        The value comes from ``cuEventElapsedTime`` (milliseconds according
        to the CUDA driver API documentation).
        """
        err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
        self._check(err, cuda.CUresult.CUDA_SUCCESS)
        return duration / float(iterations)

View File

@ -0,0 +1,39 @@
################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typing import Union
from typeguard import typechecked
# Type aliases kept as plain strings: the classes they name are not imported
# in this module, so the unions are never evaluated here.
# NOTE(review): presumably consumed as forward-reference annotations by other
# modules -- confirm against the call sites.

# Any of the GEMM operation flavours.
GemmOperation = 'Union[GemmOperationUniversal, GemmOperationGrouped]'
# Any of the tensor containers: raw device pointer, numpy, torch or cupy array.
Tensor = 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]'

View File

@ -0,0 +1 @@
from pycutlass.utils.reference_model import *

View File

@ -0,0 +1,234 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import numpy as np
import cutlass
from pycutlass.library import TensorDescription
from typing import Union
try:
import torch
torch_available = True
except ImportError:
torch_available = False
class ReferenceModule:
    """
    Reference GEMM: computes ``alpha * (A @ B) + beta * C`` with numpy or
    torch, honouring the layouts of the three operands.
    """

    def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription) -> None:
        self.layout_A = A.layout
        self.layout_B = B.layout
        self.layout_C = C.layout

    @staticmethod
    def _as_row_major(flat, layout, rows, cols, use_numpy):
        """
        View the flat buffer ``flat`` as a (rows, cols) matrix.

        A ``cutlass.RowMajor`` buffer is reshaped directly; any other layout
        is treated as column-major, i.e. reshaped to (cols, rows) and
        transposed.
        """
        if use_numpy:
            # The shape is passed positionally: the ``newshape`` keyword is
            # deprecated in recent NumPy releases.
            if layout == cutlass.RowMajor:
                return np.reshape(flat, (rows, cols))
            return np.transpose(np.reshape(flat, (cols, rows)), axes=(1, 0))

        if layout == cutlass.RowMajor:
            return flat.view((rows, cols))
        return torch.permute(flat.view((cols, rows)), (1, 0))

    def run(self, A: np.ndarray, B: np.ndarray, C: np.ndarray, problem_size: cutlass.gemm.GemmCoord, alpha: float=1.0, beta: float=0.0):
        """
        Compute the reference result

        Args:
            A: dense operator with shape (M, K) in row-major and (K, M) in column-major
            B: dense operator with shape (K, N) in row-major and (N, K) in column-major
            C: dense operator with shape (M, N) in row-major and (N, M) in column-major
            problem_size: GEMM extents, read through ``m()``, ``n()``, ``k()``
            alpha: scale applied to the product ``A @ B``
            beta: scale applied to ``C``

        Returns:
            The flattened result, stored in the layout of C, as the same
            container kind as ``A`` (numpy array or torch tensor).

        Raises:
            TypeError: if ``A`` is neither a numpy array nor a torch tensor
                (the latter only when torch could be imported).
        """
        M, N, K = problem_size.m(), problem_size.n(), problem_size.k()

        # The container kind of A selects the backend for all three operands.
        if isinstance(A, np.ndarray):
            use_numpy = True
        elif torch_available and isinstance(A, torch.Tensor):
            use_numpy = False
        else:
            # Previously this fell through and returned None (or hit a
            # NameError on ``torch`` when torch was not installed).
            raise TypeError(
                "Unsupported tensor type for reference GEMM: %s" % type(A).__name__)

        A_row = self._as_row_major(A, self.layout_A, M, K, use_numpy)
        B_row = self._as_row_major(B, self.layout_B, K, N, use_numpy)
        C_row = self._as_row_major(C, self.layout_C, M, N, use_numpy)

        if use_numpy:
            out = np.matmul(A_row, B_row) * alpha + C_row * beta
            if self.layout_C == cutlass.ColumnMajor:
                # Flattening the transpose yields column-major element order.
                out = np.transpose(out, axes=(1, 0))
            return out.ravel()

        out = torch.matmul(A_row, B_row) * alpha + C_row * beta
        if self.layout_C == cutlass.ColumnMajor:
            out = torch.permute(out, (1, 0))
        return torch.flatten(out)
#####################################################################################################
# Conv2d
#####################################################################################################
if torch_available:
    class Conv2dReferenceModule:
        """
        Reference 2-D convolution (fprop / dgrad / wgrad) built on torch.

        Operands arrive as flat NHWC buffers, are permuted to the NCHW order
        torch expects, and the result is returned flattened in NHWC order.
        """

        def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription, kind: cutlass.conv.Operator) -> None:
            self.layout_A = A.layout
            self.layout_B = B.layout
            self.layout_C = C.layout
            self.kind = kind

        @staticmethod
        def _numpy_to_nchw(flat, shape):
            """Reshape a flat numpy NHWC buffer, move it to CUDA, permute to NCHW."""
            nhwc = torch.from_numpy(np.reshape(flat, shape)).to("cuda")
            return torch.permute(nhwc, (0, 3, 1, 2))

        @staticmethod
        def _torch_to_nchw(flat, shape):
            """View a flat torch NHWC buffer with ``shape`` and permute to NCHW."""
            return torch.permute(flat.view(shape), (0, 3, 1, 2))

        def run(self,
                A: Union[np.ndarray, torch.Tensor],
                B: Union[np.ndarray, torch.Tensor],
                C: Union[np.ndarray, torch.Tensor], problem_size, alpha=1.0, beta=0.0) -> Union[np.ndarray, torch.Tensor]:
            """
            Compute the reference result ``alpha * conv(A, B) + beta * C``.

            Args:
                A, B, C: flat operand buffers; the container kind of ``A``
                    (numpy or torch) selects the conversion path for all three.
                problem_size: convolution extents (N, H, W, C, K, R, S, P, Q,
                    strides, padding, dilation, groups).
                alpha: scale applied to the convolution result.
                beta: scale applied to ``C``.

            Returns:
                The flattened NHWC result: a numpy array for numpy inputs,
                a torch tensor for torch inputs.

            Raises:
                NotImplementedError: if any operand layout is not TensorNHWC.
                TypeError: if ``A`` is neither a numpy array nor a torch tensor.
                ValueError: if ``self.kind`` is not fprop, dgrad or wgrad.
            """
            n = problem_size.N
            h = problem_size.H
            w = problem_size.W
            c = problem_size.C

            k = problem_size.K
            r = problem_size.R
            s = problem_size.S

            p = problem_size.P
            q = problem_size.Q

            stride_h = problem_size.stride_h
            stride_w = problem_size.stride_w

            pad_h = problem_size.pad_h
            pad_w = problem_size.pad_w

            dilation_h = problem_size.dilation_h
            dilation_w = problem_size.dilation_w

            groups = problem_size.groups

            # Only NHWC operands are handled; report that explicitly instead
            # of failing later on an unbound local.
            for layout in (self.layout_A, self.layout_B, self.layout_C):
                if layout == cutlass.TensorNHWC:
                    continue
                raise NotImplementedError(
                    "Conv2dReferenceModule only supports TensorNHWC operands")

            kind = self.kind
            is_fprop = kind == cutlass.conv.Operator.fprop
            is_dgrad = kind == cutlass.conv.Operator.dgrad
            is_wgrad = kind == cutlass.conv.Operator.wgrad
            if not (is_fprop or is_dgrad or is_wgrad):
                raise ValueError("Unsupported conv kind: %s" % str(kind))

            # NHWC extents of the three logical tensors of the convolution.
            activation_shape = (n, h, w, c)
            filter_shape = (k, r, s, c)
            output_shape = (n, p, q, k)

            # Which logical tensor each of A / B / C holds depends on the
            # kind. The same mapping is now applied to numpy inputs too;
            # they were previously always reshaped with the fprop shapes.
            if is_wgrad:
                shape_A, shape_B, shape_C = output_shape, activation_shape, filter_shape
            elif is_dgrad:
                shape_A, shape_B, shape_C = output_shape, filter_shape, activation_shape
            else:
                shape_A, shape_B, shape_C = activation_shape, filter_shape, output_shape

            numpy_input = isinstance(A, np.ndarray)
            if numpy_input:
                # the pytorch activation layout is NCHW
                # weight layout is Cout Cin Kh Kw (also NCHW)
                to_nchw = self._numpy_to_nchw
            elif isinstance(A, torch.Tensor):
                to_nchw = self._torch_to_nchw
            else:
                raise TypeError(
                    "Unsupported tensor type for reference conv2d: %s" % type(A).__name__)

            A_torch_nchw = to_nchw(A, shape_A)
            B_torch_nchw = to_nchw(B, shape_B)
            C_torch_nchw = to_nchw(C, shape_C)

            if is_fprop:
                D_torch_nchw = alpha * torch.nn.functional.conv2d(
                    A_torch_nchw, B_torch_nchw, stride=(stride_h, stride_w),
                    padding=(pad_h, pad_w), dilation=(dilation_h, dilation_w), groups=groups) + beta * C_torch_nchw
            elif is_dgrad:
                # NOTE(review): dilation and groups are not forwarded here
                # (nor for wgrad below) -- confirm this is intentional.
                D_torch_nchw = alpha * torch.nn.grad.conv2d_input(
                    (n, c, h, w), B_torch_nchw, A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
                ).to(torch.float32) + beta * C_torch_nchw
            else:
                D_torch_nchw = alpha * torch.nn.grad.conv2d_weight(
                    B_torch_nchw, (k, c, r, s), A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
                ).to(torch.float32) + beta * C_torch_nchw

            # Back to NHWC; numpy callers get a numpy array on the host.
            D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1))
            if numpy_input:
                D_torch_out = D_torch_out.detach().cpu().numpy()
            return D_torch_out.flatten()

View File

@ -0,0 +1,274 @@
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930

View File

@ -0,0 +1,187 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Dgrad conv2d tests: float16 NHWC operands, float16 accumulation, TensorOp, arch=80.

    The tests differ only in the iterator algorithm, the operand alignment
    and (for the ``align4`` variants) an explicit problem-size list, so the
    shared setup lives in private helpers.
    """

    @staticmethod
    def _make_operation(iterator_algorithm, alignment):
        """Build the dgrad ``Conv2dOperation`` used by every test in this class.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member.
            alignment: alignment passed to the A, B and C tensor descriptions.

        Returns:
            The configured ``Conv2dOperation``.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )
        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    @staticmethod
    def _align4_problem_sizes():
        """Return the single explicit problem size used by the ``align4`` tests.

        The last dimension of both 4-D coords is 12, which is a multiple of 4
        but not of 8.
        NOTE(review): argument order is presumably input size, filter size,
        padding, stride, dilation, mode, split-k slices, groups -- confirm
        against ``cutlass.conv.Conv2dProblemSize``.
        """
        return [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Analytic iterator, alignment 8, default problem sizes."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.analytic, 8)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Optimized iterator, alignment 8, default problem sizes."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.optimized, 8)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        """Analytic iterator, alignment 4, explicit problem size."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.analytic, 4)
        problem_sizes = self._align4_problem_sizes()
        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        """Optimized iterator, alignment 4, explicit problem size."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.optimized, 4)
        problem_sizes = self._align4_problem_sizes()
        self.assertTrue(test_all_conv2d(operation, problem_sizes))
if __name__ == '__main__':
    # This file only uses wildcard imports from pycutlass, so bind the package
    # name explicitly instead of relying on it leaking through ``import *``.
    import pycutlass

    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26 (presumably bytes -- confirm against pycutlass).
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()

View File

@ -0,0 +1,162 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Dgrad conv2d tests: float16 NHWC inputs, float32 accumulation/output, TensorOp, arch=80.

    All tests use the optimized iterator with unity stride support; they
    differ only in the threadblock K extent and the number of stages.
    """

    @staticmethod
    def _make_operation(threadblock_k, stages):
        """Build the dgrad ``Conv2dOperation`` shared by every test in this class.

        Args:
            threadblock_k: last entry of the ``[128, 128, K]`` threadblock shape.
            stages: value passed as ``stages`` to ``TileDescription``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, threadblock_k], stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )
        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        """Threadblock 128x128x32, 3 stages."""
        operation = self._make_operation(threadblock_k=32, stages=3)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        """Threadblock 128x128x32, 4 stages."""
        operation = self._make_operation(threadblock_k=32, stages=4)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        """Threadblock 128x128x64, 3 stages."""
        operation = self._make_operation(threadblock_k=64, stages=3)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        """Threadblock 128x128x64, 4 stages."""
        operation = self._make_operation(threadblock_k=64, stages=4)
        self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
    # Set up the pycutlass memory pool, then hand control to unittest.
    # Both arguments equal 2**26 (presumably bytes -- confirm against pycutlass).
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,89 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Dgrad conv2d tests: float32 NHWC operands, Simt opcode class, arch=80.

    The two tests differ only in the iterator algorithm and the warp count.
    """

    @staticmethod
    def _make_operation(iterator_algorithm, warp_count):
        """Build the dgrad ``Conv2dOperation`` shared by both tests.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member.
            warp_count: three-element list passed to ``TileDescription``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=warp_count,
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )
        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    # NOTE(review): the method name says "Fprop" but the operation built is
    # dgrad. The name is kept so the existing test ID stays valid; consider
    # renaming it to "..._Dgrad_Analytic_...".
    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Analytic iterator with warp count [4, 2, 1]."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.analytic, [4, 2, 1])
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Optimized iterator with warp count [2, 4, 1]."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.optimized, [2, 4, 1])
        self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
    # Set up the pycutlass memory pool, then hand control to unittest.
    # Both arguments equal 2**26 (presumably bytes -- confirm against pycutlass).
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,86 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Dgrad conv2d tests: float32 NHWC operands on TensorOp with a 16x8x8 instruction, arch=80.

    The two tests differ only in the iterator algorithm.
    """

    @staticmethod
    def _make_operation(iterator_algorithm):
        """Build the dgrad ``Conv2dOperation`` shared by both tests.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member.

        Returns:
            The configured ``Conv2dOperation``.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        # NOTE(review): C is float32 with alignment=8 while A and B use 4;
        # value kept as-is -- confirm this is intended.
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=8)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )
        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        """Analytic iterator."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        """Optimized iterator."""
        operation = self._make_operation(cutlass.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
    # Set up the pycutlass memory pool, then hand control to unittest.
    # Both arguments equal 2**26 (presumably bytes -- confirm against pycutlass).
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,154 @@
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass.test import *
import unittest
def conv2d_few_channel_problemsizes(channels):
    """Return the seven ``Conv2dProblemSize`` cases used by the few-channels tests.

    Args:
        channels: value used as the last coordinate of both the first and
            the second ``Tensor4DCoord`` of every problem.

    Returns:
        A list of ``cutlass.conv.Conv2dProblemSize`` objects, in table order.
    """
    # Each row: (extent, filters, window, stride).
    #   first coord  -> (1, extent, extent, channels)
    #   second coord -> (filters, window, window, channels)
    #   first matrix coord -> (stride, stride)
    # NOTE(review): presumably input H/W, filter count, filter R/S and conv
    # stride -- confirm against cutlass.conv.Conv2dProblemSize.
    cases = (
        (8, 16, 3, 2),
        (16, 16, 3, 2),
        (16, 16, 7, 1),
        (224, 32, 7, 1),
        (224, 64, 7, 2),
        (224, 64, 5, 1),
        (224, 64, 5, 2),
    )
    return [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, extent, extent, channels),
            cutlass.Tensor4DCoord(filters, window, window, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(stride, stride),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )
        for extent, filters, window, stride in cases
    ]
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """Fprop conv2d tests using the ``few_channels`` iterator algorithm.

    Operands are float16 NHWC with float32 accumulation on TensorOp, arch=80.
    The tests differ in instruction shape, A/B alignment, threadblock shape,
    stage count and the channel count of the problem sizes.
    """

    @staticmethod
    def _make_operation(instruction_shape, alignment, threadblock_shape, stages):
        """Build the fprop ``Conv2dOperation`` shared by both tests.

        Args:
            instruction_shape: three-element list for ``MathInstruction``.
            alignment: alignment of the A and B tensor descriptions (C stays at 8).
            threadblock_shape: three-element list for ``TileDescription``.
            stages: value passed as ``stages`` to ``TileDescription``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        math_inst = MathInstruction(
            instruction_shape=instruction_shape,
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=alignment)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)
        tile_description = TileDescription(
            threadblock_shape=threadblock_shape, stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80, max_compute=80
        )
        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        """Two channels: 16x8x16 instruction, A/B alignment 2, 128x128x64 tile, 3 stages."""
        operation = self._make_operation(
            instruction_shape=[16, 8, 16], alignment=2,
            threadblock_shape=[128, 128, 64], stages=3)
        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
        """One channel: 16x8x8 instruction, A/B alignment 1, 128x128x32 tile, 2 stages."""
        operation = self._make_operation(
            instruction_shape=[16, 8, 8], alignment=1,
            threadblock_shape=[128, 128, 32], stages=2)
        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
if __name__ == '__main__':
    # Set up the pycutlass memory pool, then hand control to unittest.
    # Both arguments equal 2**26 (presumably bytes -- confirm against pycutlass).
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,175 @@
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass.test import *
import unittest
def conv2d_fixed_channel_problemsizes(channels):
    """Return the five ``Conv2dProblemSize`` cases used by the fixed-channels tests.

    Args:
        channels: value used as the last coordinate of both the first and
            the second ``Tensor4DCoord`` of every problem.

    Returns:
        A list of ``cutlass.conv.Conv2dProblemSize`` objects, in table order.
    """
    # Each row: (extent, filters, window, stride).
    #   first coord  -> (1, extent, extent, channels)
    #   second coord -> (filters, window, window, channels)
    #   first matrix coord -> (stride, stride)
    # NOTE(review): presumably input H/W, filter count, filter R/S and conv
    # stride -- confirm against cutlass.conv.Conv2dProblemSize.
    table = (
        (8, 16, 3, 2),
        (224, 32, 7, 1),
        (224, 64, 7, 2),
        (224, 64, 5, 1),
        (224, 64, 5, 2),
    )
    problem_sizes = []
    for extent, filters, window, stride in table:
        problem_sizes.append(
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, extent, extent, channels),
                cutlass.Tensor4DCoord(filters, window, window, channels),
                cutlass.Tensor4DCoord(1, 1, 1, 1),
                cutlass.MatrixCoord(stride, stride),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            )
        )
    return problem_sizes
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop tests using the fixed-channels iterator (f16 operands, f32 accumulation).

    The three tests are identical apart from the channel count, which is used
    both as the alignment of the A/B tensors and as the argument passed to
    ``conv2d_fixed_channel_problemsizes``.
    """

    def _check_fixed_channels(self, channels):
        """Build the fixed-channels fprop operation for *channels* and verify it.

        Args:
            channels: alignment applied to the A and B tensor descriptions and
                the value forwarded to ``conv2d_fixed_channel_problemsizes``.
        """
        half = cutlass.float16
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half,
            element_b=half,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        # A and B take their element types from the math instruction.
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=channels)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=channels)
        # The C tensor keeps alignment 8 regardless of the channel count.
        output_c = TensorDescription(element=half, layout=nhwc, alignment=8)
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=cutlass.float32,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        problems = conv2d_fixed_channel_problemsizes(channels)
        self.assertTrue(test_all_conv2d(operation, problems))

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
        """Fixed-channels fprop with 8 channels."""
        self._check_fixed_channels(8)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
        """Fixed-channels fprop with 4 channels."""
        self._check_fixed_channels(4)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        """Fixed-channels fprop with 2 channels."""
        self._check_fixed_channels(2)
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,291 @@
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """SM80 fprop implicit-GEMM tests with f16 operands, f16 accumulation and f16 output.

    Every test builds the same tensor-op operation; only the iterator
    algorithm, the A/B alignment and the list of problems vary.
    """

    @staticmethod
    def _problem_sizes(channels):
        """Return the three explicit problems used by the reduced-alignment tests.

        Args:
            channels: three values; the i-th value fills the last coordinate
                of the first two ``Tensor4DCoord`` arguments of problem i.
        """
        first, second, third = channels
        mode = cutlass.conv.Mode.cross_correlation
        return [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, first),
                cutlass.Tensor4DCoord(8, 3, 3, first),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                mode,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, second),
                cutlass.Tensor4DCoord(8, 3, 3, second),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                mode,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 23, 56, third),
                cutlass.Tensor4DCoord(128, 3, 3, third),
                cutlass.Tensor4DCoord(4, 0, 5, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                mode,
                1, 1
            ),
        ]

    def _check(self, iterator_algorithm, alignment=8, problem_sizes=None):
        """Build the fprop operation and assert that ``test_all_conv2d`` succeeds.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member to use.
            alignment: alignment of the A and B tensor descriptions.
            problem_sizes: explicit problems to run; when ``None`` the
                single-argument form ``test_all_conv2d(operation)`` is called.
        """
        half = cutlass.float16
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half,
            element_b=half,
            element_accumulator=half,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=alignment)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=alignment)
        # The C tensor keeps alignment 8 in every test of this class.
        output_c = TensorDescription(element=half, layout=nhwc, alignment=8)
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=half,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        if problem_sizes is None:
            passed = test_all_conv2d(operation)
        else:
            passed = test_all_conv2d(operation, problem_sizes)
        self.assertTrue(passed)

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Analytic iterator, A/B alignment 8."""
        self._check(cutlass.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Optimized iterator, A/B alignment 8."""
        self._check(cutlass.conv.IteratorAlgorithm.optimized)

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        """Analytic iterator, A/B alignment 2, explicit problems."""
        self._check(
            cutlass.conv.IteratorAlgorithm.analytic,
            alignment=2,
            problem_sizes=self._problem_sizes((12, 14, 98)),
        )

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        """Optimized iterator, A/B alignment 2, explicit problems."""
        self._check(
            cutlass.conv.IteratorAlgorithm.optimized,
            alignment=2,
            problem_sizes=self._problem_sizes((12, 14, 98)),
        )

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        """Analytic iterator, A/B alignment 4, explicit problems."""
        # This test previously built its operation with the optimized
        # iterator although it is named "Analytic"; it now uses the analytic
        # iterator so the test exercises what its name says.
        self._check(
            cutlass.conv.IteratorAlgorithm.analytic,
            alignment=4,
            problem_sizes=self._problem_sizes((12, 28, 100)),
        )
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,48 @@
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop implicit-GEMM test with f16 operands, f32 accumulation and f32 output."""

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        """Verify fprop with the analytic iterator via ``test_all_conv2d``."""
        half, single = cutlass.float16, cutlass.float32
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half,
            element_b=half,
            element_accumulator=single,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        # A and B take their element types from the math instruction.
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=8)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=8)
        output_c = TensorDescription(element=single, layout=nhwc, alignment=4)
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=single,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,87 @@
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """SM80 fprop implicit-GEMM tests on the SIMT opcode class (all f32).

    The two tests differ in iterator algorithm, warp count and swizzling
    functor; everything else is shared by ``_check``.
    """

    def _check(self, iterator_algorithm, warp_count, swizzling_functor):
        """Build the SIMT fprop operation and assert ``test_all_conv2d`` succeeds.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member to use.
            warp_count: three-element warp count for the tile description.
            swizzling_functor: swizzling functor passed to ``Conv2dOperation``.
        """
        single = cutlass.float32
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=single,
            element_b=single,
            element_accumulator=single,
            opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=4)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=4)
        output_c = TensorDescription(element=single, layout=nhwc, alignment=1)
        tile = TileDescription(
            threadblock_shape=[128, 128, 8],
            stages=4,
            warp_count=warp_count,
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=single,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=swizzling_functor,
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Analytic iterator, warp count [4, 2, 1], IdentitySwizzle2."""
        self._check(
            cutlass.conv.IteratorAlgorithm.analytic,
            warp_count=[4, 2, 1],
            swizzling_functor=cutlass.IdentitySwizzle2,
        )

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Optimized iterator, warp count [2, 4, 1], IdentitySwizzle1."""
        self._check(
            cutlass.conv.IteratorAlgorithm.optimized,
            warp_count=[2, 4, 1],
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,98 @@
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """SM80 fprop implicit-GEMM tensor-op tests with float32 elements and f32 accumulation.

    The two tests differ in iterator algorithm, A/B alignment and whether an
    explicit problem list is supplied.
    """

    def _check(self, iterator_algorithm, alignment, problem_sizes=None):
        """Build the fprop operation and assert that ``test_all_conv2d`` succeeds.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member to use.
            alignment: alignment of the A and B tensor descriptions.
            problem_sizes: explicit problems to run; when ``None`` the
                single-argument form ``test_all_conv2d(operation)`` is called.
        """
        single = cutlass.float32
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=single,
            element_b=single,
            element_accumulator=single,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=alignment)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=alignment)
        # NOTE(review): alignment=8 on a float32 tensor is carried over
        # unchanged from both original tests -- confirm it is intentional.
        output_c = TensorDescription(element=single, layout=nhwc, alignment=8)
        tile = TileDescription(
            threadblock_shape=[128, 128, 16],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=single,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        if problem_sizes is None:
            passed = test_all_conv2d(operation)
        else:
            passed = test_all_conv2d(operation, problem_sizes)
        self.assertTrue(passed)

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        """Analytic iterator, A/B alignment 4."""
        self._check(cutlass.conv.IteratorAlgorithm.analytic, alignment=4)

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
        """Optimized iterator, A/B alignment 2, one explicit problem."""
        problems = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            )
        ]
        self._check(
            cutlass.conv.IteratorAlgorithm.optimized,
            alignment=2,
            problem_sizes=problems,
        )
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,235 @@
# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
    """SM80 dgrad implicit-GEMM tests (f16 operands, f32 accumulation and output).

    All tests use ``StrideSupport.Strided`` with
    ``cutlass.StridedDgradIdentitySwizzle1``; they vary the iterator
    algorithm, A/B alignment, tile shape and the list of problems.
    """

    def _check(self, iterator_algorithm, alignment=8,
               threadblock_shape=(128, 128, 32), warp_count=(2, 2, 1),
               problem_sizes=None):
        """Build the dgrad operation and assert that ``test_all_conv2d`` succeeds.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member to use.
            alignment: alignment of the A and B tensor descriptions.
            threadblock_shape: three-element threadblock tile shape.
            warp_count: three-element warp count.
            problem_sizes: explicit problems to run; when ``None`` the
                single-argument form ``test_all_conv2d(operation)`` is called.
        """
        half, single = cutlass.float16, cutlass.float32
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half,
            element_b=half,
            element_accumulator=single,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=alignment)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=alignment)
        output_c = TensorDescription(element=single, layout=nhwc, alignment=4)
        # The shapes are handed over as lists, exactly as the original tests did.
        tile = TileDescription(
            threadblock_shape=list(threadblock_shape),
            stages=3,
            warp_count=list(warp_count),
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=single,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.StridedDgradIdentitySwizzle1,
        )
        if problem_sizes is None:
            passed = test_all_conv2d(operation)
        else:
            passed = test_all_conv2d(operation, problem_sizes)
        self.assertTrue(passed)

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
        """Analytic iterator, 128x128x32 threadblock, A/B alignment 8."""
        self._check(cutlass.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
        """Analytic iterator, 128x256x64 threadblock, warp count [2, 4, 1]."""
        self._check(
            cutlass.conv.IteratorAlgorithm.analytic,
            threadblock_shape=(128, 256, 64),
            warp_count=(2, 4, 1),
        )

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
        """Analytic iterator, A/B alignment 4, one explicit problem."""
        problems = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self._check(
            cutlass.conv.IteratorAlgorithm.analytic,
            alignment=4,
            problem_sizes=problems,
        )

    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
        """Optimized iterator, 128x128x32 threadblock, A/B alignment 8."""
        self._check(cutlass.conv.IteratorAlgorithm.optimized)

    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
        """Optimized iterator, A/B alignment 4, two explicit problems."""
        problems = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 56, 56, 12),
                cutlass.Tensor4DCoord(8, 1, 1, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(2, 2),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 55, 55, 12),
                cutlass.Tensor4DCoord(8, 1, 1, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(2, 2),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self._check(
            cutlass.conv.IteratorAlgorithm.optimized,
            alignment=4,
            problem_sizes=problems,
        )
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,86 @@
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """SM80 wgrad implicit-GEMM tests with f16 operands, f16 accumulation and f16 output.

    The two tests differ only in the iterator algorithm.
    """

    def _check(self, iterator_algorithm):
        """Build the wgrad operation and assert that ``test_all_conv2d`` succeeds.

        Args:
            iterator_algorithm: ``cutlass.conv.IteratorAlgorithm`` member to use.
        """
        half = cutlass.float16
        nhwc = cutlass.TensorNHWC
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half,
            element_b=half,
            element_accumulator=half,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=8)
        operand_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=8)
        output_c = TensorDescription(element=half, layout=nhwc, alignment=8)
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
            min_compute=80,
            max_compute=80,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=output_c,
            element_epilogue=half,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Wgrad with the analytic iterator."""
        self._check(cutlass.conv.IteratorAlgorithm.analytic)

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Wgrad with the optimized iterator."""
        self._check(cutlass.conv.IteratorAlgorithm.optimized)
if __name__ == '__main__':
    # Set up the pycutlass memory pool before running the tests; both
    # arguments are 2**26.
    # NOTE(review): presumably the initial and maximum pool sizes -- confirm.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,224 @@
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
    """Verify wgrad with the analytic iterator (f16 operands, f32 accumulation and output)."""
    half, single = cutlass.float16, cutlass.float32
    nhwc = cutlass.TensorNHWC
    math_inst = MathInstruction(
        instruction_shape=[16, 8, 8],
        element_a=half,
        element_b=half,
        element_accumulator=single,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )
    # A and B take their element types from the math instruction.
    operand_a = TensorDescription(
        element=math_inst.element_a, layout=nhwc, alignment=8)
    operand_b = TensorDescription(
        element=math_inst.element_b, layout=nhwc, alignment=8)
    output_c = TensorDescription(element=single, layout=nhwc, alignment=4)
    tile = TileDescription(
        threadblock_shape=[128, 128, 16],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=math_inst,
        min_compute=80,
        max_compute=80,
    )
    operation = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.wgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
        arch=80,
        tile_description=tile,
        A=operand_a, B=operand_b, C=output_c,
        element_epilogue=single,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1,
    )
    self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
    """Verify wgrad with the optimized iterator (f16 operands, f32 accumulation and output)."""
    half, single = cutlass.float16, cutlass.float32
    nhwc = cutlass.TensorNHWC
    math_inst = MathInstruction(
        instruction_shape=[16, 8, 8],
        element_a=half,
        element_b=half,
        element_accumulator=single,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )
    # A and B take their element types from the math instruction.
    operand_a = TensorDescription(
        element=math_inst.element_a, layout=nhwc, alignment=8)
    operand_b = TensorDescription(
        element=math_inst.element_b, layout=nhwc, alignment=8)
    output_c = TensorDescription(element=single, layout=nhwc, alignment=4)
    tile = TileDescription(
        threadblock_shape=[128, 128, 16],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=math_inst,
        min_compute=80,
        max_compute=80,
    )
    operation = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.wgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80,
        tile_description=tile,
        A=operand_a, B=operand_b, C=output_c,
        element_epilogue=single,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1,
    )
    self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
    """Verify wgrad with the optimized iterator on a 64x256x32 threadblock tile.

    The name suffix ``64x256_32x4_64x64x32`` encodes the 64x256 threadblock,
    K=32 with 4 stages, and the per-warp tile that results from the
    [1, 4, 1] warp count.
    """
    half, single = cutlass.float16, cutlass.float32
    nhwc = cutlass.TensorNHWC
    math_inst = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=half,
        element_b=half,
        element_accumulator=single,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )
    # A and B take their element types from the math instruction.
    operand_a = TensorDescription(
        element=math_inst.element_a, layout=nhwc, alignment=8)
    operand_b = TensorDescription(
        element=math_inst.element_b, layout=nhwc, alignment=8)
    output_c = TensorDescription(element=single, layout=nhwc, alignment=4)
    tile = TileDescription(
        threadblock_shape=[64, 256, 32],
        # Previously 3, which contradicted the "32x4" (K x stages) part of
        # the test name.
        # NOTE(review): confirm 4 stages is the intended configuration.
        stages=4,
        warp_count=[1, 4, 1],
        math_instruction=math_inst,
        min_compute=80,
        max_compute=80,
    )
    operation = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.wgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80,
        tile_description=tile,
        A=operand_a, B=operand_b, C=output_c,
        element_epilogue=single,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1,
    )
    self.assertTrue(test_all_conv2d(operation))
def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
    # Wgrad conv2d through the analytic iterator with alignment-4 f16
    # operands, checked on one explicitly listed problem size.
    mma = MathInstruction(
        instruction_shape=[16, 8, 8],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )
    nhwc = cutlass.TensorNHWC
    operand_a = TensorDescription(element=mma.element_a, layout=nhwc, alignment=4)
    operand_b = TensorDescription(element=mma.element_b, layout=nhwc, alignment=4)
    operand_c = TensorDescription(element=cutlass.float32, layout=nhwc, alignment=4)
    tile = TileDescription(
        threadblock_shape=[128, 128, 16],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=mma,
        min_compute=80,
        max_compute=80
    )
    wgrad_op = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.wgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
        arch=80,
        tile_description=tile,
        A=operand_a, B=operand_b, C=operand_c,
        element_epilogue=cutlass.float32,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1
    )
    # NOTE(review): the positional arguments presumably follow CUTLASS's
    # Conv2dProblemSize order (input, filter, padding, stride, dilation,
    # mode, split_k_slices, groups) -- confirm against the pycutlass binding.
    single_problem = cutlass.conv.Conv2dProblemSize(
        cutlass.Tensor4DCoord(1, 4, 4, 12),
        cutlass.Tensor4DCoord(8, 3, 3, 12),
        cutlass.Tensor4DCoord(0, 0, 0, 0),
        cutlass.MatrixCoord(3, 3),
        cutlass.MatrixCoord(1, 1),
        cutlass.conv.Mode.cross_correlation,
        1, 1
    )
    passed = test_all_conv2d(wgrad_op, [single_problem])
    self.assertTrue(passed)
def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
    # Wgrad conv2d through the optimized iterator with alignment-4 f16
    # operands, checked on one explicitly listed problem size.
    mma = MathInstruction(
        instruction_shape=[16, 8, 8],
        element_a=cutlass.float16,
        element_b=cutlass.float16,
        element_accumulator=cutlass.float32,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )
    nhwc = cutlass.TensorNHWC
    operand_a = TensorDescription(element=mma.element_a, layout=nhwc, alignment=4)
    operand_b = TensorDescription(element=mma.element_b, layout=nhwc, alignment=4)
    operand_c = TensorDescription(element=cutlass.float32, layout=nhwc, alignment=4)
    tile = TileDescription(
        threadblock_shape=[128, 128, 16],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=mma,
        min_compute=80,
        max_compute=80
    )
    wgrad_op = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.wgrad,
        iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80,
        tile_description=tile,
        A=operand_a, B=operand_b, C=operand_c,
        element_epilogue=cutlass.float32,
        stride_support=StrideSupport.Strided,
        epilogue_functor=EpilogueFunctor.LinearCombination,
        swizzling_functor=cutlass.IdentitySwizzle1
    )
    # NOTE(review): the positional arguments presumably follow CUTLASS's
    # Conv2dProblemSize order (input, filter, padding, stride, dilation,
    # mode, split_k_slices, groups) -- confirm against the pycutlass binding.
    single_problem = cutlass.conv.Conv2dProblemSize(
        cutlass.Tensor4DCoord(1, 4, 4, 12),
        cutlass.Tensor4DCoord(8, 3, 3, 12),
        cutlass.Tensor4DCoord(0, 0, 0, 0),
        cutlass.MatrixCoord(3, 3),
        cutlass.MatrixCoord(1, 1),
        cutlass.conv.Mode.cross_correlation,
        1, 1
    )
    passed = test_all_conv2d(wgrad_op, [single_problem])
    self.assertTrue(passed)
if __name__ == '__main__':
    # Set up pycutlass's memory pool before unittest starts running cases.
    # NOTE(review): the two arguments are presumably the initial and maximum
    # pool sizes (2**26 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,87 @@
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """SIMT f32 wgrad conv2d cases; they differ only in the iterator algorithm."""

    def _check_wgrad(self, iterator_algorithm):
        """Build the f32 SIMT wgrad operation and assert test_all_conv2d passes.

        iterator_algorithm is forwarded unchanged to Conv2dOperation; every
        other setting is shared by the test cases of this class.
        """
        fma = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32,
            element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )
        nhwc = cutlass.TensorNHWC
        operand_a = TensorDescription(element=fma.element_a, layout=nhwc, alignment=4)
        operand_b = TensorDescription(element=fma.element_b, layout=nhwc, alignment=4)
        operand_c = TensorDescription(element=cutlass.float32, layout=nhwc, alignment=1)
        tile = TileDescription(
            threadblock_shape=[128, 128, 8],
            stages=4,
            warp_count=[2, 4, 1],
            math_instruction=fma,
            min_compute=80,
            max_compute=80
        )
        wgrad_op = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=operand_c,
            element_epilogue=cutlass.float32,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(wgrad_op))

    def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._check_wgrad(cutlass.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._check_wgrad(cutlass.conv.IteratorAlgorithm.optimized)
if __name__ == '__main__':
    # Set up pycutlass's memory pool before unittest starts running cases.
    # NOTE(review): the two arguments are presumably the initial and maximum
    # pool sizes (2**26 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,98 @@
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Tensor-op wgrad conv2d cases on f32 NHWC tensors using the optimized iterator."""

    def _make_wgrad_operation(self, operand_alignment, output_alignment, threadblock_shape):
        """Return the Conv2dOperation shared by the cases of this class.

        operand_alignment applies to both A and B, output_alignment to C;
        threadblock_shape is forwarded to TileDescription.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32,
            element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        nhwc = cutlass.TensorNHWC
        operand_a = TensorDescription(
            element=mma.element_a, layout=nhwc, alignment=operand_alignment)
        operand_b = TensorDescription(
            element=mma.element_b, layout=nhwc, alignment=operand_alignment)
        operand_c = TensorDescription(
            element=cutlass.float32, layout=nhwc, alignment=output_alignment)
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80
        )
        return Conv2dOperation(
            conv_kind=cutlass.conv.Operator.wgrad,
            iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80,
            tile_description=tile,
            A=operand_a, B=operand_b, C=operand_c,
            element_epilogue=cutlass.float32,
            stride_support=StrideSupport.Strided,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        # NOTE(review): alignment=8 on the f32 output differs from the 4 used
        # by the sibling case -- confirm it is intended.
        wgrad_op = self._make_wgrad_operation(
            operand_alignment=4, output_alignment=8,
            threadblock_shape=[128, 128, 16])
        self.assertTrue(test_all_conv2d(wgrad_op))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
        # Alignment-1 operands, checked on one explicitly listed problem size.
        wgrad_op = self._make_wgrad_operation(
            operand_alignment=1, output_alignment=4,
            threadblock_shape=[128, 128, 32])
        # NOTE(review): the positional arguments presumably follow CUTLASS's
        # Conv2dProblemSize order (input, filter, padding, stride, dilation,
        # mode, split_k_slices, groups) -- confirm against the pycutlass binding.
        single_problem = cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 8, 8, 1),
            cutlass.Tensor4DCoord(1, 3, 3, 1),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )
        self.assertTrue(test_all_conv2d(wgrad_op, [single_problem]))
if __name__ == '__main__':
    # Set up pycutlass's memory pool before unittest starts running cases.
    # NOTE(review): the two arguments are presumably the initial and maximum
    # pool sizes (2**26 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 26, 1 << 26)
    unittest.main()

View File

@ -0,0 +1,10 @@
import pycutlass
import unittest
from pycutlass.memory_manager import *
if __name__ == '__main__':
    # Imported here so this entry point does not depend on a file-level import.
    import sys

    pycutlass.get_memory_pool(2**32, 2**32)
    # Collect every test module matching conv2d_*.py under the current directory.
    suite = unittest.TestLoader().discover('./', 'conv2d_*.py')
    outcome = unittest.runner.TextTestRunner().run(suite)
    # TextTestRunner.run() only reports; without this the process would exit
    # with status 0 even when tests failed, hiding failures from callers/CI.
    sys.exit(0 if outcome.wasSuccessful() else 1)

View File

@ -0,0 +1 @@
# Run test_frontend.py with CUPY_CACHE_DIR set to the current directory for
# this one invocation only.
CUPY_CACHE_DIR=./ python test_frontend.py

View File

@ -0,0 +1,136 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
## Test cases for the PyTorch and CuPy frontends
import pycutlass
import unittest
from pycutlass import *
import torch
import cupy as cp
class Test_Frontend(unittest.TestCase):
    """Runs one SIMT f32 GEMM with torch tensors and with cupy arrays as operands."""

    def setUp(self) -> None:
        # Define the cutlass operator used by both tests and hand it to
        # pycutlass.compiler.add_module.
        simt_fma = MathInstruction(
            [1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
            cutlass.OpClass.Simt, MathOperation.multiply_add
        )
        tile = TileDescription([128, 128, 8], 4, [2, 4, 1], simt_fma, 80, 80)
        # A, B and C are described identically: f32, row-major, alignment 1.
        operand_a = TensorDescription(cutlass.float32, cutlass.RowMajor, 1)
        operand_b = TensorDescription(cutlass.float32, cutlass.RowMajor, 1)
        operand_c = TensorDescription(cutlass.float32, cutlass.RowMajor, 1)
        self.operation = GemmOperationUniversal(
            arch=80, tile_description=tile,
            A=operand_a, B=operand_b, C=operand_c,
            element_epilogue=cutlass.float32,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        pycutlass.compiler.add_module([self.operation,])

    def _run_gemm(self, problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta):
        """Run self.operation on the given operands and wait for it via arguments.sync()."""
        # NOTE(review): `split_k_splices` looks like a misspelling of
        # `split_k_slices`; kept verbatim -- confirm against GemmArguments.
        arguments = GemmArguments(
            operation=self.operation, problem_size=problem_size,
            A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
            output_op=LinearCombinationFunctorArguments(alpha, beta),
            gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
        )
        self.operation.run(arguments)
        arguments.sync()

    def test_torch_frontend(self):
        problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
        m, n, k = problem_size.m(), problem_size.n(), problem_size.k()

        def ceiled_uniform(rows, cols):
            # Uniform samples from uniform_(-8.5, 7.5), rounded up with ceil.
            blank = torch.empty(size=(rows, cols), dtype=torch.float32, device="cuda")
            return torch.ceil(blank.uniform_(-8.5, 7.5))

        tensor_A = ceiled_uniform(m, k)
        tensor_B = ceiled_uniform(k, n)
        tensor_C = ceiled_uniform(m, n)
        tensor_D = torch.empty_like(tensor_C)
        alpha = 1.0
        beta = 0.0
        self._run_gemm(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta)
        tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
        matches = torch.equal(tensor_D, tensor_D_ref)
        self.assertTrue(matches)

    def test_cupy_frontend(self):
        cp.cuda.set_allocator(rmm.rmm_cupy_allocator)
        problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
        m, n, k = problem_size.m(), problem_size.n(), problem_size.k()

        def ceiled_uniform(rows, cols):
            # Uniform samples between -8.5 and 7.5, rounded up with ceil.
            samples = cp.random.uniform(low=-8.5, high=7.5, size=(rows, cols), dtype=cp.float32)
            return cp.ceil(samples)

        tensor_A = ceiled_uniform(m, k)
        tensor_B = ceiled_uniform(k, n)
        tensor_C = ceiled_uniform(m, n)
        tensor_D = cp.ones_like(tensor_C)
        alpha = 1.0
        beta = 1.0
        # The reference is computed before the kernel runs; it does not read tensor_D.
        tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
        self._run_gemm(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta)
        matches = cp.array_equal(tensor_D, tensor_D_ref)
        self.assertTrue(matches)
if __name__ == '__main__':
    # Set up pycutlass's memory pool before unittest starts running cases.
    # NOTE(review): the two arguments are presumably the initial and maximum
    # pool sizes (2**32 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 32, 1 << 32)
    unittest.main()

View File

@ -0,0 +1,93 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
from pycutlass.test.gemm_testbed import test_all_gemm
class GemmBF16TensorOpSm80(unittest.TestCase):
    """bf16 tensor-op GEMM cases for SM80, each checked with test_all_gemm."""

    def _check_gemm(self, threadblock_shape, stages, operand_layout,
                    element_c, alignment_c, testcase):
        """Build one bf16 GemmOperationUniversal and assert test_all_gemm passes.

        operand_layout is used for both A and B (alignment 8); C is always
        row-major with the given element type and alignment. testcase is the
        second argument handed to test_all_gemm.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.bfloat16,
            element_b=cutlass.bfloat16,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80
        )
        operand_a = TensorDescription(
            element=cutlass.bfloat16, layout=operand_layout, alignment=8)
        operand_b = TensorDescription(
            element=cutlass.bfloat16, layout=operand_layout, alignment=8)
        operand_c = TensorDescription(
            element=element_c, layout=cutlass.RowMajor, alignment=alignment_c)
        gemm_op = GemmOperationUniversal(
            arch=80, tile_description=tile,
            A=operand_a, B=operand_b, C=operand_c,
            element_epilogue=cutlass.float32,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_gemm(gemm_op, testcase))

    # NOTE(review): this name lacks the `test_` prefix, so unittest's default
    # loader does not collect it. Kept as-is -- confirm whether the case is
    # disabled on purpose.
    def SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32_64x128x64_32x64x64(self):
        self._check_gemm(
            threadblock_shape=[64, 128, 64], stages=4,
            operand_layout=cutlass.ColumnMajor,
            element_c=cutlass.float32, alignment_c=4,
            testcase="universal"
        )

    def test_SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32_128x256x64_64x64x64(self):
        # NOTE(review): the name says 128x256x64 but the tile built here is
        # [64, 128, 32] -- confirm which one is intended.
        self._check_gemm(
            threadblock_shape=[64, 128, 32], stages=6,
            operand_layout=cutlass.RowMajor,
            element_c=cutlass.bfloat16, alignment_c=8,
            testcase="multistage"
        )
if __name__ == '__main__':
    # Set up pycutlass's memory pool before unittest starts running cases.
    # NOTE(review): the two arguments are presumably the initial and maximum
    # pool sizes (2**24 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 24, 1 << 24)
    unittest.main()

View File

@ -0,0 +1,425 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
from pycutlass.test.gemm_testbed import test_all_gemm
class GemmF16Sm80(unittest.TestCase):
    """f16 tensor-op GEMM cases for SM80, each checked with test_all_gemm(op, "universal").

    Every case uses a [16, 8, 16] multiply-add tensor-op instruction on f16
    A/B operands with alignment 8; the keyword arguments of each test spell
    out everything that differs between cases.

    NOTE(review): several method names do not match the configuration they
    build (element types, layouts, a doubled `test_` prefix). Names are kept
    unchanged so existing test IDs stay stable -- confirm and rename
    separately if desired.
    """

    def _check_universal(self, *, accumulator, threadblock_shape, stages, warp_count,
                         layout_a, layout_b, element_c, layout_c, alignment_c,
                         element_epilogue, swizzling_functor=None, **operation_kwargs):
        """Build one GemmOperationUniversal and assert that test_all_gemm passes.

        swizzling_functor falls back to cutlass.IdentitySwizzle1 when omitted.
        Extra keyword arguments (e.g. direct_store) are forwarded verbatim to
        GemmOperationUniversal, so cases that do not set them keep the
        constructor's own defaults.
        """
        if swizzling_functor is None:
            swizzling_functor = cutlass.IdentitySwizzle1
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16,
            element_b=cutlass.float16,
            element_accumulator=accumulator,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages,
            warp_count=warp_count,
            math_instruction=mma,
            min_compute=80,
            max_compute=80
        )
        operand_a = TensorDescription(
            element=cutlass.float16, layout=layout_a, alignment=8)
        operand_b = TensorDescription(
            element=cutlass.float16, layout=layout_b, alignment=8)
        operand_c = TensorDescription(
            element=element_c, layout=layout_c, alignment=alignment_c)
        gemm_op = GemmOperationUniversal(
            arch=80, tile_description=tile,
            A=operand_a, B=operand_b, C=operand_c,
            element_epilogue=element_epilogue,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=swizzling_functor,
            **operation_kwargs
        )
        self.assertTrue(test_all_gemm(gemm_op, "universal"))

    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
        # The only case that uses the batched swizzle and direct_store=True.
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[128, 128, 32], stages=3, warp_count=[2, 2, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.RowMajor,
            element_c=cutlass.float32, layout_c=cutlass.ColumnMajor, alignment_c=4,
            element_epilogue=cutlass.float32,
            swizzling_functor=cutlass.BatchedIdentitySwizzle,
            direct_store=True
        )

    def test_SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32_128x128x64_64x64x64(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[128, 128, 64], stages=3, warp_count=[2, 2, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.ColumnMajor,
            element_c=cutlass.float16, layout_c=cutlass.RowMajor, alignment_c=8,
            element_epilogue=cutlass.float32
        )

    def test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[128, 256, 64], stages=3, warp_count=[2, 4, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.ColumnMajor,
            element_c=cutlass.float32, layout_c=cutlass.ColumnMajor, alignment_c=4,
            element_epilogue=cutlass.float32
        )

    def test_SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32_256x128x64_64x64x64(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[256, 128, 64], stages=3, warp_count=[4, 2, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.ColumnMajor,
            element_c=cutlass.float32, layout_c=cutlass.RowMajor, alignment_c=4,
            element_epilogue=cutlass.float32
        )

    def test_SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
        # f16 accumulator and f16 epilogue.
        self._check_universal(
            accumulator=cutlass.float16,
            threadblock_shape=[128, 64, 64], stages=3, warp_count=[2, 1, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.RowMajor,
            element_c=cutlass.float16, layout_c=cutlass.RowMajor, alignment_c=4,
            element_epilogue=cutlass.float16
        )

    def test_SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32_64x64x32_32x32x32(self):
        # f16 accumulator and f16 epilogue with a 10-stage pipeline.
        self._check_universal(
            accumulator=cutlass.float16,
            threadblock_shape=[64, 64, 32], stages=10, warp_count=[2, 2, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.RowMajor,
            element_c=cutlass.float16, layout_c=cutlass.RowMajor, alignment_c=4,
            element_epilogue=cutlass.float16
        )

    def test_SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32_256x128x64_64x64x64(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[256, 128, 64], stages=3, warp_count=[4, 2, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.RowMajor,
            element_c=cutlass.float16, layout_c=cutlass.RowMajor, alignment_c=8,
            element_epilogue=cutlass.float32
        )

    def test_test_SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[128, 64, 64], stages=3, warp_count=[2, 1, 1],
            layout_a=cutlass.RowMajor, layout_b=cutlass.ColumnMajor,
            element_c=cutlass.float16, layout_c=cutlass.RowMajor, alignment_c=4,
            element_epilogue=cutlass.float32
        )

    def test_SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[128, 256, 64], stages=3, warp_count=[2, 4, 1],
            layout_a=cutlass.RowMajor, layout_b=cutlass.RowMajor,
            element_c=cutlass.float16, layout_c=cutlass.ColumnMajor, alignment_c=8,
            element_epilogue=cutlass.float32
        )

    def test_SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_universal(
            accumulator=cutlass.float32,
            threadblock_shape=[128, 256, 64], stages=3, warp_count=[2, 4, 1],
            layout_a=cutlass.ColumnMajor, layout_b=cutlass.ColumnMajor,
            element_c=cutlass.float32, layout_c=cutlass.ColumnMajor, alignment_c=4,
            element_epilogue=cutlass.float32
        )
if __name__ == '__main__':
    # Set up pycutlass's memory pool before unittest starts running cases.
    # NOTE(review): the two arguments are presumably the initial and maximum
    # pool sizes (2**24 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 24, 1 << 24)
    unittest.main()

View File

@ -0,0 +1,138 @@
import pycutlass
from pycutlass import *
from pycutlass.memory_manager import get_allocated_size
from pycutlass.test import *
import unittest
from pycutlass.test.gemm_testbed import test_all_gemm
class GemmF32nF32nF32nTensorOpF32Sm80(unittest.TestCase):
    """f32 tensor-op GEMM cases for SM80 that vary the math operation, tile and A layout."""

    def _check_universal(self, math_operation, threadblock_shape, layout_a):
        """Build one f32 GemmOperationUniversal and assert test_all_gemm passes.

        Shared by every case: [16, 8, 8] tensor-op instruction, 3 stages,
        [2, 2, 1] warps, alignment 4 everywhere, column-major B and row-major C.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32,
            element_b=cutlass.float32,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=math_operation
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80
        )
        operand_a = TensorDescription(
            element=cutlass.float32, layout=layout_a, alignment=4)
        operand_b = TensorDescription(
            element=cutlass.float32, layout=cutlass.ColumnMajor, alignment=4)
        operand_c = TensorDescription(
            element=cutlass.float32, layout=cutlass.RowMajor, alignment=4)
        gemm_op = GemmOperationUniversal(
            arch=80, tile_description=tile,
            A=operand_a, B=operand_b, C=operand_c,
            element_epilogue=cutlass.float32,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_gemm(gemm_op, "universal"))

    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
        self._check_universal(
            math_operation=MathOperation.multiply_add_fast_bf16,
            threadblock_shape=[128, 128, 32],
            layout_a=cutlass.RowMajor
        )

    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32_128x128x32_64x64x32(self):
        self._check_universal(
            math_operation=MathOperation.multiply_add,
            threadblock_shape=[128, 128, 32],
            layout_a=cutlass.ColumnMajor
        )

    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_fast_accurate_f32_64x64x32_32x32x32(self):
        self._check_universal(
            math_operation=MathOperation.multiply_add_fast_f32,
            threadblock_shape=[64, 64, 32],
            layout_a=cutlass.ColumnMajor
        )
if __name__ == '__main__':
    # Set up pycutlass's memory pool, then call the compiler's
    # load_from_cache() before unittest starts running cases.
    # NOTE(review): the two pool arguments are presumably the initial and
    # maximum sizes (2**24 each) -- confirm against get_memory_pool.
    pycutlass.get_memory_pool(1 << 24, 1 << 24)
    pycutlass.compiler.load_from_cache()
    unittest.main()

View File

@ -0,0 +1,95 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
from pycutlass.test.gemm_testbed import test_all_gemm
class GemmF64TensorOpSm80(unittest.TestCase):
    """Universal GEMM tests for float64 tensor-op kernels targeting SM80."""

    def _assert_f64_universal_gemm(self, threadblock_shape, layout_a, layout_b):
        """Build an f64 tensor-op GEMM and assert the "universal" sweep passes.

        The tests in this class share every setting except the threadblock
        tile and the layouts of A and B, so only those are parameters.
        C is always row-major.
        """
        f64 = cutlass.float64

        mma = MathInstruction(
            instruction_shape=[8, 8, 4],
            element_a=f64,
            element_b=f64,
            element_accumulator=f64,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=4,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80,
        )

        # Double-precision operands are restricted to alignment 1.
        operand_a = TensorDescription(element=f64, layout=layout_a, alignment=1)
        operand_b = TensorDescription(element=f64, layout=layout_b, alignment=1)
        operand_c = TensorDescription(element=f64, layout=cutlass.RowMajor, alignment=1)

        gemm_op = GemmOperationUniversal(
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=operand_c,
            element_epilogue=f64,
            epilogue_functor=EpilogueFunctor.LinearCombination,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(gemm_op, "universal"))

    def test_SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64_32x32x16_16x16x16(self):
        """Column-major A, row-major B, 32x32x16 threadblock tile."""
        self._assert_f64_universal_gemm(
            [32, 32, 16], cutlass.ColumnMajor, cutlass.RowMajor
        )

    def test_SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64_64x64x16_32x32x16(self):
        """Row-major A, column-major B, 64x64x16 threadblock tile."""
        self._assert_f64_universal_gemm(
            [64, 64, 16], cutlass.RowMajor, cutlass.ColumnMajor
        )
if __name__ == "__main__":
    # Script entry point: request a pycutlass memory pool (both arguments
    # are 2**24) before handing control to unittest's command-line runner.
    pool_size = 2 ** 24
    pycutlass.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -0,0 +1,197 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
from pycutlass.test.gemm_grouped_testbed import TestbedGrouped
class GemmGroupedSm80(unittest.TestCase):
    """Grouped GEMM tests for SM80; each runs under both scheduler modes."""

    def _assert_grouped_gemm(self, tile, operand_a, operand_b, operand_c,
                             element_epilogue, run_size):
        """Build a grouped GEMM per scheduler mode and assert its testbed run.

        The operation is constructed once with ``SchedulerMode.Device`` and
        once with ``SchedulerMode.Host``. ``run_size`` is forwarded unchanged
        to ``TestbedGrouped.run`` (presumably the number of problems in the
        group -- confirm against the testbed).
        """
        linear_combination = EpilogueFunctor.LinearCombination
        batched_swizzle = cutlass.BatchedIdentitySwizzle

        for schedule in (SchedulerMode.Device, SchedulerMode.Host):
            grouped_op = GemmOperationGrouped(
                tile.minimum_compute_capability,
                tile, operand_a, operand_b, operand_c,
                element_epilogue,
                linear_combination, batched_swizzle,
                precompute_mode=schedule,
            )
            runner = TestbedGrouped(operation=grouped_op)
            self.assertTrue(runner.run(run_size))

    def _assert_f16_tensor_op_128x128x32(self, run_size):
        """Shared body of the two f16 tensor-op tests (128x128x32 tile).

        A and B are float16 with alignment 8, C is float32 with alignment 4;
        all three are column-major.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16,
            element_b=cutlass.float16,
            element_accumulator=cutlass.float32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=[128, 128, 32],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80,
        )
        operand_a = TensorDescription(
            element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8
        )
        operand_b = TensorDescription(
            element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8
        )
        operand_c = TensorDescription(
            element=cutlass.float32, layout=cutlass.ColumnMajor, alignment=4
        )
        self._assert_grouped_gemm(
            tile, operand_a, operand_b, operand_c, cutlass.float32, run_size
        )

    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32(self):
        """f16 tensor-op grouped GEMM, testbed run with 24."""
        # NOTE(review): the name says "f16t" for B, but B is ColumnMajor here
        # -- confirm which is intended.
        self._assert_f16_tensor_op_128x128x32(24)

    def test_SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64_64x64x16_32x32x16(self):
        """f64 tensor-op grouped GEMM: row-major A/B, column-major C."""
        f64 = cutlass.float64

        mma = MathInstruction(
            instruction_shape=[8, 8, 4],
            element_a=f64,
            element_b=f64,
            element_accumulator=f64,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=[64, 64, 16],
            stages=4,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80,
        )
        operand_a = TensorDescription(element=f64, layout=cutlass.RowMajor, alignment=1)
        operand_b = TensorDescription(element=f64, layout=cutlass.RowMajor, alignment=1)
        operand_c = TensorDescription(element=f64, layout=cutlass.ColumnMajor, alignment=1)

        self._assert_grouped_gemm(tile, operand_a, operand_b, operand_c, f64, 24)

    def test_SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32_128x64x8_64x32x1(self):
        """f32 SIMT grouped GEMM with all operands row-major."""
        f32 = cutlass.float32

        mma = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=f32,
            element_b=f32,
            element_accumulator=f32,
            opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=[128, 64, 8],
            stages=4,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80,
        )
        operand_a = TensorDescription(element=f32, layout=cutlass.RowMajor, alignment=1)
        operand_b = TensorDescription(element=f32, layout=cutlass.RowMajor, alignment=1)
        operand_c = TensorDescription(element=f32, layout=cutlass.RowMajor, alignment=1)

        self._assert_grouped_gemm(tile, operand_a, operand_b, operand_c, f32, 27)

    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32_cache(self):
        """Same kernel configuration as the non-"_cache" f16 test, run with 5."""
        # NOTE(review): the name says "f16t" for B, but B is ColumnMajor here
        # -- confirm which is intended.
        self._assert_f16_tensor_op_128x128x32(5)
if __name__ == "__main__":
    # Script entry point: request a pycutlass memory pool (both arguments
    # are 2**26) before handing control to unittest's command-line runner.
    pool_size = 2 ** 26
    pycutlass.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -0,0 +1,219 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
from pycutlass.test.gemm_testbed import test_all_gemm
class GemmS8TensorOpF32Sm80(unittest.TestCase):
    """GEMM tests for int8 tensor-op kernels (int32 accumulation) on SM80."""

    def _assert_s8_gemm(self, math_operation, threadblock_shape, stages,
                        a_spec, b_spec, c_spec,
                        element_epilogue, epilogue_functor, mode):
        """Build an int8 tensor-op GEMM and assert ``test_all_gemm`` passes.

        ``a_spec``, ``b_spec`` and ``c_spec`` are ``(element, layout,
        alignment)`` triples used to construct the operand descriptions.
        ``mode`` is the string passed as the second argument of
        ``test_all_gemm``.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 32],
            element_a=cutlass.int8,
            element_b=cutlass.int8,
            element_accumulator=cutlass.int32,
            opcode_class=cutlass.OpClass.TensorOp,
            math_operation=math_operation,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=mma,
            min_compute=80,
            max_compute=80,
        )

        operand_a, operand_b, operand_c = [
            TensorDescription(element=element, layout=layout, alignment=alignment)
            for element, layout, alignment in (a_spec, b_spec, c_spec)
        ]

        gemm_op = GemmOperationUniversal(
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=operand_c,
            element_epilogue=element_epilogue,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(gemm_op, mode))

    def _assert_multistage_128x128x128(self, c_spec, element_epilogue,
                                       epilogue_functor):
        """Shared body of the "multistage" tests.

        These tests use row-major int8 A, column-major int8 B (both with
        alignment 16), a 128x128x128 threadblock tile and 3 stages; they
        differ only in the C operand and the epilogue.
        """
        self._assert_s8_gemm(
            MathOperation.multiply_add,
            [128, 128, 128],
            3,
            (cutlass.int8, cutlass.RowMajor, 16),
            (cutlass.int8, cutlass.ColumnMajor, 16),
            c_spec,
            element_epilogue,
            epilogue_functor,
            "multistage",
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_64x64x64_32x32x64(self):
        """Interleaved-layout int8 GEMM with saturating multiply-add."""
        self._assert_s8_gemm(
            MathOperation.multiply_add_saturate,
            [64, 64, 64],
            6,
            (cutlass.int8, cutlass.ColumnMajorInterleaved32, 16),
            (cutlass.int8, cutlass.RowMajorInterleaved32, 16),
            (cutlass.int8, cutlass.ColumnMajorInterleaved32, 8),
            cutlass.float32,
            EpilogueFunctor.FastLinearCombinationClamp,
            "interleaved",
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_256x128x128_64x64x128(self):
        """Multistage int8 GEMM with row-major int8 C."""
        # NOTE(review): the test name says 256x128x128, but the threadblock
        # tile actually used is [128, 128, 128] -- confirm whether the name
        # or the tile shape is the intended one.
        self._assert_multistage_128x128x128(
            (cutlass.int8, cutlass.RowMajor, 16),
            cutlass.float32,
            EpilogueFunctor.FastLinearCombinationClamp,
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32_128x128x128_64x64x128(self):
        """Multistage int8 GEMM with column-major int8 C."""
        self._assert_multistage_128x128x128(
            (cutlass.int8, cutlass.ColumnMajor, 16),
            cutlass.float32,
            EpilogueFunctor.FastLinearCombinationClamp,
        )

    def test_SM80_Device_Gemm_s8t_s8n_s32n_tensor_op_s32_128x128x128_64x64x128(self):
        """Multistage int8 GEMM with column-major int32 C."""
        self._assert_multistage_128x128x128(
            (cutlass.int32, cutlass.ColumnMajor, 4),
            cutlass.int32,
            EpilogueFunctor.LinearCombinationClamp,
        )

    def test_SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32_128x128x128_64x64x128(self):
        """Multistage int8 GEMM with row-major int32 C."""
        self._assert_multistage_128x128x128(
            (cutlass.int32, cutlass.RowMajor, 4),
            cutlass.int32,
            EpilogueFunctor.LinearCombinationClamp,
        )
if __name__ == "__main__":
    # Script entry point: request a pycutlass memory pool (both arguments
    # are 2**24) before handing control to unittest's command-line runner.
    pool_size = 2 ** 24
    pycutlass.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -0,0 +1,9 @@
import pycutlass
import unittest
if __name__ == '__main__':
    # Runs every test module matching 'gemm_*.py' found under the current
    # working directory and reports the outcome through the exit status.
    import sys

    pycutlass.get_memory_pool(2**26, 2**26)
    loader = unittest.TestLoader()
    tests = loader.discover('./', 'gemm_*.py')
    testRunner = unittest.runner.TextTestRunner()
    # TextTestRunner.run() only prints the report; it never changes the
    # process exit status. Without this check the script exits 0 even when
    # tests fail, so callers such as CI would treat a broken run as green.
    results = testRunner.run(tests)
    sys.exit(0 if results.wasSuccessful() else 1)

View File

@ -0,0 +1,350 @@
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3016005301 4142905842
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3337296764 4183699161 3654176452
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3852963969 864006170 920352568
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 2750240096 2120184232 2600672872
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3224082300 2084034673 3588056946
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 3033073939 304048758 1882633089
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 610026473 447427404 2639856195
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2818680871 58428273 3332443900
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 1891702153 103393067 2558647731
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 162127134 3567670201 3173514764
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 162127134 3567670201 363897018
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 1350938697 1696306119 1005311005
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3884703009 3552725366 1975514757 1210310496
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3393706648 3519979618 1149261202 799742106
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3409586999 409840186 1724648597 2642018980
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1815685330 1398622058 2431638856 1016967269
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2555706782 3271563943 1020153035 299097281
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4173830187 736684125 472021975 2064613035
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 2751224679 2250540122 3725638844
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 1583610315 3287895411 2394340435
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 2356047354 7055632 915702611
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2748205217 2539405983 1217377670 2011175578
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2748205217 2114448427 249997769 2711364520
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1528321643 1532777511 3597171412 296622236
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1326617037 3415095747 847196866 1481554158
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1122706355 2841974626 2791878604 632900093
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1728385278 2462678309 3066040807 1334515660
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2175275779 1117731224 857614711 2096711962
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4140401170 3710340185 1683575469 317397427
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3552249008 2918315307 2290683130 536859016
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2869959072 2516947012 3328285094 2393284712
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1349264322 1823945068 400087667 2893025864
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3321662203 426084311 4233055093 4078572279
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3321662203 426084311 4233055093 3044377475
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 803041205 2521863610 3206942690 127091020
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4083508736 37801570 240515127 2234797539
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2207374588 535059558 2268619394 1489214085
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 dnhwc_dnhwc_dnhwc_d_d 3614026280 1721563676 2979825951 1104908081
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 2226238626 2053372396 2462697514
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 235646718 1374133172 3696289981
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 184705847 3148323124 84213385
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2264868815 1724845245 3498302256 4094034457
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 233390337 1801952602 3532884734
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3048346885 2306163504 642074123 4083120683
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2798030672 683783039 3025345160 1890891136
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1731071506 1844675436 2292509333 4006304179
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 132147677 604503886 143348844 3037223953
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 1678940393 3405733837 1820114523
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 1678940393 3405733837 467254076
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1794301352 2320042028 2134048179 508141072
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 561590023 3382154048 4154621995 517057927
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 593915463 2360210889 2685491481 2265099675
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 2226238626 1155815529 558646991
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 1876429398 4216128545 1754596046
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 348523586 2609019785 3938405680 2601133907
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1984146316 1475870285 1157657800 1143965395
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2971058593 1478256319 503014742 3930504182
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 1214508920 1537003531 3830217225
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2031518387 2695641559 933408074 4026827730
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 517276344 1158854831 3123629043
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 517276344 1448394173 1864626308
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 711164468 2465036841 2993377049
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2264868815 3003481795 333430991 3094857755
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 1126010692 3313703859 637497110
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1130094757 2605103293 2477101661 1276123281
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4286533436 1302900889 2613245986 2523724148
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3048346885 923365529 1681226722 417509256
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2798030672 3441819646 1293178065 188472807
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1731071506 1117530547 2706270359 502156742
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 132147677 2029225588 3851064913 3164530726
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 2337137106 3312954197 2466682688
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 2337137106 3312954197 2684544683
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1794301352 72938921 2354994612 1463501392
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 252570564 2903451081 3619280116 1448586411
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2037991187 1665743881 241585763 103256264
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 dnhwc_dnhwc_dnhwc_d_d 2653975581 3337638999 1440125233 2448165745
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_f_f 2149247508 1775375365 3317647029 2497607448
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 927718585 4106152802 720400339 3989318043
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 3464637181 4051957661 126285749
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 832653836 3723472741 2044236350 2463899842
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2075083065 2042513140 3691286135 322550345
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4005590448 1116254439 2328237343 1918824440
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 181075276 1743485155 3526891198 1979405632
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1513864544 386662952 4057300775 1456746562
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 856324887 3954249564 2340393915 4127188930
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 1300426008 2921497047 4145791960
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 1300426008 4080981223 3076991942
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 832653836 447261065 3823545045 392205236
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3484040069 2966693627 3900095420 919511892
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1513864544 1759979610 4272621682 1029257940
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1906605830 2980501720 978889789 3136018973
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 805717279 3502822733 1810065278 1387739380
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 868180534 3289288595 209477462 4142168174
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3437976747 3391080565 97275649 4063718293
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4195072693 1669352457 2182133559 2494741804
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3457330201 1126870455 319272291 3811977088
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 754609939 1723074453 1660326213 3902884425
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 754609939 1723074453 1660326213 423159249
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1690216859 2413490039 223529410 3303697952
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3168796339 1601750164 1428743330 403295189
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 261954979 1300976652 2749562370 3058142403
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_f_f 3747142491 1747587481 3143977827 835130482
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 4172720592 446082987
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1101653138 3727072529 875733988
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3906526127 655926291 939844058
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 2031878085 1709408312 1277173429
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 22652410 1700696921 2175632852
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 436588210 470857851 284463232
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 59350507 969037229 1510558485
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 856797938 2030818524 4231831552
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 2885833872 2829967135 3441569557
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 4148824382 2827420298 378131261
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 4148824382 2827420298 2955292920
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 1474248671 1302526250 4182204885
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1569788048 162506176 819639712 763595635
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 945660191 3750377696 2496492611 3515056508
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2806300501 2591577756 3148637036 3845512743
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2322444122 3525997046 281106520 3456307300
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 327345109 1137297282 1938163814 2551101563
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 797067973 481331945 350851834 2477733239
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 2044204046 1034822169 3340281844
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 4174274001 1597212204 1881272946
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 1535088984 3001492060 2308505016
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3190527989 3733991924 4211138051 3710311115
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3190527989 3430768821 1043108884 4185640072
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 943531303 1948306075 3877008798 2803592376
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3262141476 4125717435 2946529611 2221512094
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1599291337 3982786366 1581171257 1188352423
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2237070215 3046262465 1926804094 1435916873
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 721666814 2012769306 1712378956 1388990183
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1596349869 3775131163 355203300 1126174452
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1380587417 1208642645 2886387159 3113955983
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1332573203 1417735573 1422796372 3309229181
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2714027800 2106992819 1196036582 2095126659
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1105097447 1992731268 2198911423 3378137735
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1105097447 1992731268 2198911423 3868431311
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2552471160 2218470296 2332616929 923645661
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2231354584 4035702005 3839068434 8981294
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 4019719318 3985307916 3604065639 277096636
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 bf16nhwc_bf16nhwc_fnhwc_f_f 258381429 3482776077 2663631601 593179089
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 188810648 1623218578 2585892217
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 691990354 3253144559 2988350639
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 1670375523 2425320272 2553108650
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1049321188 1865889553 3610888033 1459693945
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 3236781482 1382111427 1986396315
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 463742721 2524037630 3070473696 210045128
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 738614177 4071452982 3401957738 2920893800
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2479111539 2662555669 781892324 2338234282
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2089076160 260434096 1539389419 1219120658
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 3344412669 2885305868 1926445693
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 3344412669 2885305868 1478058549
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3945616248 4118489020 2885143346 1545684873
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 295760528 1685244361 3337423971 772814550
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 623727338 942771643 2634710231 3063349371
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 188810648 2709881923 3532383400
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 3762161398 3733128758 3693097785
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 139944998 3812563855 253288229 1359907535
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 492562992 3677108443 525487530 445191233
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 594197095 3773864559 91136873 4170763393
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 1025574686 1127709182 677727764
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1901075489 3296829308 2591894666 2932517926
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 4223561525 1263618595 50680160
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 4223561525 1756414462 3209752057
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 1023542180 121940906 624551470
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1049321188 296097075 1423016429 1058165639
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 4160685370 2761559427 1788182893
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1859384988 222880684 1650970502 1632078530
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1704522433 2403392926 3985958544 1432584676
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 463742721 3455033786 385631111 1683348880
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 738614177 3199562330 1513955316 2131256035
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2479111539 2702777753 2608107448 4014212857
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2089076160 4042009058 106232038 1140762595
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 2260768172 1186911503 3194129408
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 2260768172 1186911503 1312312812
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3945616248 2287161276 36034283 4262860382
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2906914535 476297538 14375779 1340176713
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 4292101959 3378414564 4259930640 1392755176
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 bf16nhwc_bf16nhwc_fnhwc_f_f 3529371817 368260304 4137156526 122558013
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 2948718568 2631391783 3260825675 4278587299
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 1635109696 2835574424 4179385325 2803281440
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 3344954627 1649157278 2032056735 1176638626
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 61750237 3452849177 1697665310 3475459781
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 1394759191 1571308277 898534533 4125341936
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 3402206912 2433594404 1575577431 4106154211
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 98638790 2735493952 346473870 1911666301
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 98638790 2735493952 346473870 2124440208
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 2934485636 3286257323 541566528 1113783492
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 164942943 4259285988 1250700182 508419908
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3805460372 2607401558 3465030781 210641751
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 4200926784 1001915027 387475271 3360115596
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 331078659 469730619 2547196469 1620698703
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 431968022 1614654085 903827412 1349891842
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3674369485 1055554271 3217013807 1356703347
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 3227824772 365527403 2720889763
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 2150996976 2899308770 2371758816
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 2124373651 2711906981 3194739760
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1070162100 2750964634 3090791018 3481982191
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1070162100 1563941622 767747438 3163252390
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 884815233 3576251756 3216742798 3534462723
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3230717758 3192193994 1161445944 371179683
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2450454245 2905280248 910194866 839083662
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2948718568 2631391783 638794727 4292051282
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1635109696 2835574424 1855687620 130932480
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3344954627 1649157278 4191418350 958044197
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 61750237 3452849177 3260472389 771128506
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1394759191 1571308277 4279538191 956191103
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3402206912 2433594404 2021112123 2983097553
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 98638790 2735493952 3178839372 568554158
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 98638790 2735493952 3178839372 18194802
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2934485636 3286257323 2559221535 2310182528
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 164942943 4259285988 984016853 888753301
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2823094147 1681845497 4242738907 3244428635
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 s8nhwc_s8nhwc_inhwc_i_i 4060010502 2881035321 3927119619 3311661122

View File

@ -0,0 +1,440 @@
#################################################################################################
#
# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
## Test case generator for SM80
import pycutlass
from pycutlass import *
from pycutlass.test import *
import unittest
#
# Create GEMM operation
#
def TestGemmOperator(gemm_kind, math_inst, layout, alignment, tiling, arch, mixed=False,
                     epilogue_functor=EpilogueFunctor.LinearCombination,
                     swizzling_functor=cutlass.IdentitySwizzle1, **kwargs):
    """
    Build a GEMM operation from a configuration and run its test routine.

    gemm_kind selects the operation: GemmKind.Universal is run through
    test_all_gemm, GemmKind.Grouped through TestbedGrouped. Any other value
    raises NotImplementedError.

    layout and alignment are indexed 0..2 for the A, B and C tensors; tiling
    is indexed 0..2 and forwarded positionally to TileDescription (presumably
    threadblock shape, stages and warp count, as TestConv2dOperator passes the
    same items by keyword -- confirm). arch is passed to TileDescription twice
    and to the operation itself.

    Recognised keyword arguments:
      data_type       -- indexable of four elements [A, B, C, epilogue]; when
                         absent the elements are derived from math_inst.
      precompute_mode -- read only for GemmKind.Grouped, where it is required
                         (a missing key raises KeyError).

    Returns whatever test_all_gemm(...) or TestbedGrouped.run(24) returns.
    """
    if "data_type" in kwargs:
        data_type = kwargs["data_type"]
    else:
        # C takes the accumulator type for mixed runs and for bfloat16 inputs;
        # otherwise it shares the element type of A.
        accumulate_into_c = mixed or math_inst.element_a == cutlass.bfloat16
        element_c = (
            math_inst.element_accumulator if accumulate_into_c else math_inst.element_a
        )
        data_type = [
            math_inst.element_a,
            math_inst.element_b,
            element_c,
            math_inst.element_accumulator,
        ]

    tile_description = TileDescription(
        tiling[0], tiling[1], tiling[2], math_inst, arch, arch
    )

    # One descriptor per operand, built in A, B, C order.
    A, B, C = (
        TensorDescription(data_type[idx], layout[idx], alignment[idx])
        for idx in range(3)
    )
    element_epilogue = data_type[3]

    if gemm_kind == GemmKind.Universal:
        operation = GemmOperationUniversal(
            arch=arch, tile_description=tile_description,
            A=A, B=B, C=C, element_epilogue=element_epilogue,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
        )
        interleaved_layouts = [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]
        test_mode = "interleaved" if A.layout in interleaved_layouts else "universal"
        return test_all_gemm(operation, test_mode)

    if gemm_kind == GemmKind.Grouped:
        operation = GemmOperationGrouped(
            arch, tile_description, A, B, C,
            element_epilogue, epilogue_functor, swizzling_functor,
            precompute_mode=kwargs["precompute_mode"]
        )
        return TestbedGrouped(operation=operation).run(24)

    raise NotImplementedError("the gemm kind is not implemented")
def TestConv2dOperator(math_inst, alignment, tiling, arch,
                       stride_supports=(StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided),
                       epilogue_functor=EpilogueFunctor.LinearCombination,
                       swizzling_functor=cutlass.IdentitySwizzle1, interleaved=False, **kwargs):
    """
    Test Conv2d operations (fprop, dgrad, wgrad) based on a configuration.

    The convolution kinds are visited in the fixed order fprop, dgrad, wgrad
    and paired element-wise with stride_supports via zip, so a shorter
    stride_supports silently drops the trailing kinds.

    alignment is indexed 0..2 for the A, B and C tensors. tiling supplies
    (threadblock_shape, stages, warp_count) at indices 0..2. arch is used as
    both min_compute and max_compute of the tile and as the operation's arch.

    Recognised keyword arguments:
      layout    -- indexable of three layouts for A, B and C; defaults to
                   TensorNHWC for all three.
      data_type -- indexable of four elements [A, B, C, epilogue]; when absent
                   the elements are derived from math_inst per conv kind.

    Returns a list with one test_all_conv2d(...) result per convolution kind
    that was actually run (int8 dgrad/wgrad are skipped).
    """
    mixeds = [False, True, False]
    conv_kinds = [cutlass.conv.Operator.fprop, cutlass.conv.Operator.dgrad, cutlass.conv.Operator.wgrad]
    results = []

    layout = kwargs.get(
        "layout", (cutlass.TensorNHWC, cutlass.TensorNHWC, cutlass.TensorNHWC)
    )

    for mixed, conv_kind, stride_support in zip(mixeds, conv_kinds, stride_supports):
        if "data_type" in kwargs:
            data_type = kwargs["data_type"]
        else:
            if mixed or math_inst.element_a == cutlass.bfloat16:
                # C is stored in the accumulator type.
                data_type = [
                    math_inst.element_a,
                    math_inst.element_b,
                    math_inst.element_accumulator,
                    math_inst.element_accumulator
                ]
            else:
                # C shares the element type of A.
                data_type = [
                    math_inst.element_a,
                    math_inst.element_b,
                    math_inst.element_a,
                    math_inst.element_accumulator
                ]

        # skip Int8 Conv Backward
        if data_type[0] == cutlass.int8 and conv_kind in [cutlass.conv.Operator.dgrad, cutlass.conv.Operator.wgrad]:
            continue

        A = TensorDescription(
            element=data_type[0],
            layout=layout[0],
            alignment=alignment[0])
        B = TensorDescription(
            element=data_type[1],
            layout=layout[1],
            alignment=alignment[1])
        C = TensorDescription(
            element=data_type[2],
            layout=layout[2],
            alignment=alignment[2])

        tile_description = TileDescription(
            threadblock_shape=tiling[0], stages=tiling[1],
            warp_count=tiling[2],
            math_instruction=math_inst,
            min_compute=arch, max_compute=arch
        )

        # Strided dgrad always uses its dedicated swizzle; every other case
        # uses the functor the caller asked for. A per-iteration local keeps
        # the caller's argument untouched across iterations.
        if conv_kind == cutlass.conv.Operator.dgrad and stride_support == StrideSupport.Strided:
            conv_swizzling_functor = cutlass.StridedDgradIdentitySwizzle1
        else:
            conv_swizzling_functor = swizzling_functor

        operation = Conv2dOperation(
            conv_kind=conv_kind, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=arch, tile_description=tile_description, A=A, B=B, C=C,
            element_epilogue=data_type[3], stride_support=stride_support,
            epilogue_functor=epilogue_functor,
            swizzling_functor=conv_swizzling_functor
        )

        results.append(test_all_conv2d(operation, interleaved=interleaved))

    return results
class Test_SM80(unittest.TestCase):
def test_SM80_TensorOp_16816(self):
    """
    Run GEMM (universal and grouped) and Conv2d tests for the [16, 8, 16]
    tensor-op multiply-add instruction with arch argument 80.

    The i-th instruction is paired with the i-th layout, alignment and tiling.
    """
    # (operand element, accumulator element) of each instruction under test
    element_pairs = [
        (cutlass.float16, cutlass.float32),
        (cutlass.float16, cutlass.float16),
        (cutlass.bfloat16, cutlass.float32),
    ]
    math_instructions = [
        MathInstruction(
            [16, 8, 16], operand, operand, accumulator,
            cutlass.OpClass.TensorOp, MathOperation.multiply_add
        )
        for operand, accumulator in element_pairs
    ]

    layouts = [
        (cutlass.RowMajor, cutlass.RowMajor, cutlass.RowMajor),
        (cutlass.ColumnMajor, cutlass.RowMajor, cutlass.RowMajor),
        (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajor),
    ]
    alignments = [(8, 8, 8), (4, 8, 8), (8, 4, 8)]
    tilings = [
        ([256, 128, 32], 3, [4, 2, 1]),
        ([64, 256, 32], 4, [1, 4, 1]),
        ([128, 64, 64], 3, [2, 2, 1]),
    ]

    configurations = zip(math_instructions, layouts, alignments, tilings)
    for math_inst, layout, alignment, tiling in configurations:
        universal_passed = TestGemmOperator(
            GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False)
        self.assertTrue(universal_passed)

        grouped_passed = TestGemmOperator(
            GemmKind.Grouped, math_inst, layout, alignment, tiling, 80, True,
            precompute_mode=SchedulerMode.Host)
        self.assertTrue(grouped_passed)

        conv_outcomes = TestConv2dOperator(
            math_inst, alignment, tiling, 80,
            stride_supports=[StrideSupport.Strided] * 3)
        for outcome in conv_outcomes:
            self.assertTrue(outcome)
def test_SM80_TensorOp_1688(self):
# tf32 is not supported by most of python environment. Skip the test
self.assertTrue(True)
def test_SM80_TensorOp_1688_fast_math(self):
    """
    Run GEMM (universal and grouped) and Conv2d tests for the [16, 8, 8]
    tensor-op instructions, including the fast-math variants, with arch
    argument 80.

    All tensors and the epilogue use float32 regardless of the instruction's
    operand type. The i-th instruction is paired with the i-th layout,
    alignment and tiling.
    """
    # (operand element, math operation); every instruction accumulates in float32
    instruction_specs = [
        (cutlass.tfloat32, MathOperation.multiply_add),
        (cutlass.float16, MathOperation.multiply_add_fast_f16),
        (cutlass.bfloat16, MathOperation.multiply_add_fast_bf16),
        (cutlass.float32, MathOperation.multiply_add_fast_f32),
    ]
    math_instructions = [
        MathInstruction(
            [16, 8, 8], operand, operand, cutlass.float32,
            cutlass.OpClass.TensorOp, math_operation
        )
        for operand, math_operation in instruction_specs
    ]

    layouts = [
        (cutlass.RowMajor, cutlass.RowMajor, cutlass.ColumnMajor),
        (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.ColumnMajor),
        (cutlass.ColumnMajor, cutlass.RowMajor, cutlass.ColumnMajor),
        (cutlass.ColumnMajor, cutlass.ColumnMajor, cutlass.RowMajor),
    ]
    alignments = [(4, 4, 4), (4, 2, 4), (2, 4, 4), (2, 2, 4)]
    tilings = [
        ([128, 256, 16], 3, [4, 2, 1]),
        ([64, 256, 16], 4, [1, 4, 1]),
        ([128, 64, 32], 3, [2, 2, 1]),
        ([256, 64, 32], 3, [4, 2, 1]),
    ]
    data_type = [cutlass.float32] * 4

    configurations = zip(math_instructions, layouts, alignments, tilings)
    for math_inst, layout, alignment, tiling in configurations:
        universal_passed = TestGemmOperator(
            GemmKind.Universal, math_inst, layout,
            alignment, tiling, 80, False, data_type=data_type)
        self.assertTrue(universal_passed)

        grouped_passed = TestGemmOperator(
            GemmKind.Grouped, math_inst, layout, alignment, tiling, 80,
            True, precompute_mode=SchedulerMode.Device, data_type=data_type)
        self.assertTrue(grouped_passed)

        conv_outcomes = TestConv2dOperator(
            math_inst, alignment, tiling, 80,
            stride_supports=[StrideSupport.Unity, StrideSupport.Strided, StrideSupport.Unity],
            data_type=data_type)
        for outcome in conv_outcomes:
            self.assertTrue(outcome)
def test_SM80_TensorOp_884(self):
    """
    Run GEMM (universal and grouped) and Conv2d tests for the float64
    [8, 8, 4] tensor-op multiply-add instruction with arch argument 80.
    """
    f64 = cutlass.float64
    instruction = MathInstruction(
        [8, 8, 4], f64, f64, f64,
        cutlass.OpClass.TensorOp, MathOperation.multiply_add
    )
    layouts = (cutlass.ColumnMajor, cutlass.ColumnMajor, cutlass.ColumnMajor)
    alignments = (1, 1, 1)
    tile = ([64, 256, 16], 3, [2, 4, 1])
    elements = [f64, f64, f64, f64]

    universal_passed = TestGemmOperator(
        GemmKind.Universal, instruction, layouts, alignments, tile, 80, False,
        data_type=elements)
    self.assertTrue(universal_passed)

    grouped_passed = TestGemmOperator(
        GemmKind.Grouped, instruction, layouts, alignments, tile, 80, True,
        precompute_mode=SchedulerMode.Device, data_type=elements)
    self.assertTrue(grouped_passed)

    conv_outcomes = TestConv2dOperator(
        instruction, alignments, tile, 80,
        stride_supports=[StrideSupport.Unity, StrideSupport.Strided, StrideSupport.Unity],
        data_type=elements)
    for outcome in conv_outcomes:
        self.assertTrue(outcome)
def test_SM80_TensorOp_16832_TN(self):
    """
    Run GEMM (universal and grouped) and Conv2d tests for the int8
    [16, 8, 32] tensor-op saturating multiply-add with arch argument 80.

    The universal GEMM and the Conv2d runs use an int32 C tensor and int32
    epilogue; the grouped GEMM uses an int8 C tensor with a float32 epilogue
    and its own alignment triple.
    """
    instruction = MathInstruction(
        [16, 8, 32], cutlass.int8, cutlass.int8, cutlass.int32,
        cutlass.OpClass.TensorOp, MathOperation.multiply_add_saturate
    )
    layouts = (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.ColumnMajor)
    tile = ([128, 256, 64], 3, [2, 4, 1])

    # int32 output configuration
    int32_alignments = (16, 16, 4)
    int32_elements = [cutlass.int8, cutlass.int8, cutlass.int32, cutlass.int32]
    # int8 output / float32 epilogue configuration
    mixed_alignments = (16, 16, 16)
    mixed_elements = [cutlass.int8, cutlass.int8, cutlass.int8, cutlass.float32]

    universal_passed = TestGemmOperator(
        GemmKind.Universal, instruction, layouts, int32_alignments, tile, 80, False,
        data_type=int32_elements)
    self.assertTrue(universal_passed)

    grouped_passed = TestGemmOperator(
        GemmKind.Grouped, instruction, layouts, mixed_alignments, tile, 80, True,
        precompute_mode=SchedulerMode.Device, data_type=mixed_elements)
    self.assertTrue(grouped_passed)

    conv_outcomes = TestConv2dOperator(
        instruction, int32_alignments, tile, 80,
        stride_supports=[StrideSupport.Strided] * 3,
        data_type=int32_elements)
    for outcome in conv_outcomes:
        self.assertTrue(outcome)
def test_SM80_Simt_f32(self):
    """
    Run GEMM (universal and grouped) and Conv2d tests for the float32 SIMT
    [1, 1, 1] multiply-add instruction with arch argument 80.
    """
    f32 = cutlass.float32
    instruction = MathInstruction(
        [1, 1, 1], f32, f32, f32,
        cutlass.OpClass.Simt, MathOperation.multiply_add
    )
    layouts = (cutlass.RowMajor, cutlass.RowMajor, cutlass.RowMajor)
    alignments = (1, 1, 1)
    tile = ([128, 256, 8], 4, [2, 4, 1])
    elements = [f32, f32, f32, f32]

    universal_passed = TestGemmOperator(
        GemmKind.Universal, instruction, layouts, alignments, tile, 80, False,
        data_type=elements)
    self.assertTrue(universal_passed)

    grouped_passed = TestGemmOperator(
        GemmKind.Grouped, instruction, layouts, alignments, tile, 80, True,
        precompute_mode=SchedulerMode.Host, data_type=elements)
    self.assertTrue(grouped_passed)

    conv_outcomes = TestConv2dOperator(
        instruction, alignments, tile, 80,
        stride_supports=[StrideSupport.Strided] * 3,
        data_type=elements)
    for outcome in conv_outcomes:
        self.assertTrue(outcome)
def test_SM80_Simt_f64(self):
    # Exercises three helpers with a float64 Simt MathInstruction
    # ([1, 1, 1], multiply_add): a Universal GEMM, a Grouped GEMM
    # (SchedulerMode.Device) and the Conv2d operator checks.
    # NOTE(review): the literal 80 handed to every helper presumably selects
    # the SM80 target named in the test -- confirm against the helpers.
    f64 = cutlass.float64
    instruction = MathInstruction(
        [1, 1, 1], f64, f64, f64,
        cutlass.OpClass.Simt, MathOperation.multiply_add,
    )
    operand_layouts = (cutlass.RowMajor, cutlass.RowMajor, cutlass.ColumnMajor)
    unit_alignment = (1, 1, 1)
    tile_config = ([64, 128, 8], 5, [2, 2, 1])
    element_types = [f64, f64, f64, f64]

    universal_ok = TestGemmOperator(
        GemmKind.Universal, instruction, operand_layouts, unit_alignment,
        tile_config, 80, False, data_type=element_types,
    )
    self.assertTrue(universal_ok)

    grouped_ok = TestGemmOperator(
        GemmKind.Grouped, instruction, operand_layouts, unit_alignment,
        tile_config, 80, True,
        precompute_mode=SchedulerMode.Device, data_type=element_types,
    )
    self.assertTrue(grouped_ok)

    strided = StrideSupport.Strided
    conv_outcomes = TestConv2dOperator(
        instruction, unit_alignment, tile_config, 80,
        stride_supports=[strided, strided, strided], data_type=element_types,
    )
    # Every entry produced by the Conv2d helper must be truthy.
    for outcome in conv_outcomes:
        self.assertTrue(outcome)
def test_SM80_TensorOp_16832_Interleaved(self):
    # Exercises a Universal GEMM (FastLinearCombinationClamp epilogue) and the
    # Conv2d operator checks (interleaved=True) with an int8 TensorOp
    # MathInstruction ([16, 8, 32], multiply_add_saturate). No Grouped GEMM
    # check is made here.
    # NOTE(review): the literal 80 handed to both helpers presumably selects
    # the SM80 target named in the test -- confirm against the helpers.
    int8 = cutlass.int8
    instruction = MathInstruction(
        [16, 8, 32], int8, int8, cutlass.int32,
        cutlass.OpClass.TensorOp, MathOperation.multiply_add_saturate,
    )
    gemm_layouts = (
        cutlass.ColumnMajorInterleaved32,
        cutlass.RowMajorInterleaved32,
        cutlass.ColumnMajorInterleaved32,
    )
    mixed_alignment = (16, 16, 8)
    tile_config = ([256, 64, 64], 4, [4, 1, 1])
    mixed_types = [int8, int8, int8, cutlass.float32]

    universal_ok = TestGemmOperator(
        GemmKind.Universal, instruction, gemm_layouts, mixed_alignment,
        tile_config, 80, False,
        data_type=mixed_types,
        epilogue_functor=EpilogueFunctor.FastLinearCombinationClamp,
    )
    self.assertTrue(universal_ok)

    strided = StrideSupport.Strided
    stride_modes = [strided, strided, strided]
    # The Conv2d helper receives its own tensor layouts, distinct from the
    # GEMM layouts above.
    conv_layouts = [cutlass.TensorNC32HW32, cutlass.TensorC32RSK32, cutlass.TensorNC32HW32]
    conv_outcomes = TestConv2dOperator(
        instruction, mixed_alignment, tile_config, 80,
        stride_supports=stride_modes, data_type=mixed_types,
        layout=conv_layouts, interleaved=True,
    )
    # Every entry produced by the Conv2d helper must be truthy.
    for outcome in conv_outcomes:
        self.assertTrue(outcome)
def SM80_SparseTensorOp_16832(self):
    # Placeholder: this method performs no checks and returns None.
    # NOTE(review): unlike its siblings, the name has no `test_` prefix, so the
    # default unittest loader would presumably not collect it -- confirm
    # whether that is intentional.
    return None
def test_SM80_PlanarComplexTensorOp_16816(self):
pass
def test_SM80_SparseTensorOp_16816_fast_math(self):
pass
def test_SM80_TensorOp_1688_complex(self):
pass
def test_SM80_TensorOp_1688_fast_fp32_math_complex(self):
pass
def test_SM80_TensorOp_1688_rank_k(self):
pass
def test_SM80_TensorOp_1688_rank_k_complex(self):
pass
def test_SM80_TensorOp_1688_trmm(self):
pass
def test_SM80_TensorOp_1688_trmm_complex(self):
pass
def test_SM80_TensorOp_1688_symm(self):
pass
def test_SM80_TensorOp_1688_symm_complex(self):
pass
def test_SM80_TensorOp_884_complex(self):
pass
def test_SM80_TensorOp_884_complex_gaussian(self):
pass
def test_SM80_TensorOp_884_rank_k(self):
pass
def test_SM80_TensorOp_884_rank_k_complex(self):
pass
def test_SM80_TensorOp_884_rank_k_complex_gaussian(self):
pass
def test_SM80_TensorOp_884_trmm(self):
pass
def test_SM80_TensorOp_884_trmm_complex(self):
pass
def test_SM80_TensorOp_884_trmm_complex_gaussian(self):
pass
def test_SM80_TensorOp_884_symm(self):
pass
def test_SM80_TensorOp_884_symm_complex(self):
pass
def test_SM80_TensorOp_884_symm_complex_gaussian(self):
pass
def test_SM80_SparseTensorOp_16864_TN(self):
pass
def test_SM80_TensorOp_16864_TN(self):
pass
def test_SM80_SparseTensorOp_168128_TN(self):
pass
def test_SM80_TensorOp_16864_Interleaved(self):
pass
def test_SM80_TensorOp_168256(self):
pass
def test_SM80_Simt_complex(self):
pass
if __name__ == '__main__':
    # Script entry point: set up pycutlass, then hand control to unittest.
    # NOTE(review): the two sizes (2**20 and 2**34) are presumably the initial
    # and maximum memory-pool sizes -- confirm against pycutlass.
    pycutlass.get_memory_pool(1 << 20, 1 << 34)
    # NOTE(review): presumably selects nvcc as the compilation backend --
    # confirm against pycutlass.compiler.
    pycutlass.compiler.nvcc()
    unittest.main()