release 2.11 (#703)

This commit is contained in:
Aditya Atluri
2022-11-19 06:02:15 -08:00
committed by GitHub
parent 3c90f6aea6
commit c975e2ccbb
329 changed files with 47332 additions and 10607 deletions

View File

@ -11,6 +11,8 @@ import argparse
from library import *
from manifest import *
from itertools import product
###################################################################################################
#
@ -49,6 +51,8 @@ def EpilogueAlignment(max_alignment, tile, epilogue_steps = 8):
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
swizzling_functor = SwizzlingFunctor.Identity8):
# Use StreamK decomposition for basic GEMMs
# swizzling_functor = SwizzlingFunctor.StreamK):
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
@ -373,11 +377,26 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
# Strided support for Analytic and Optimized Fprop
for iterator_algorithm in iterator_algorithms:
new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)
manifest.append(new_operation)
operations.append(new_operation)
new_operations = [
# None grouped kernel
Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_),
]
# Instance group conv kernel
if tile.math_instruction.opcode_class == OpcodeClass.TensorOp and A.layout == LayoutType.TensorNHWC:
# SingleGroup kernel
new_operations.append(Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_, group_mode=GroupMode.SingleGroup))
# Analytic iterator supports MultipleGroup mode
if iterator_algorithm == IteratorAlgorithm.Analytic:
new_operations.append(Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_, group_mode=GroupMode.MultipleGroup))
for new_operation in new_operations:
manifest.append(new_operation)
operations.append(new_operation)
#
# Conv2d Dgrad
@ -593,6 +612,62 @@ def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignme
return operations
# Convolution for Depthwise 2d conv
def CreateDepthwiseConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment_constraints, \
    conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
    epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
    """Create depthwise Conv2d Fprop operations and append them to `manifest`.

    For every (tile, alignment) pair, one `Conv2dOperation` with
    `GroupMode.Depthwise` is built per applicable iterator algorithm, but only
    when `ConvKind.Fprop` is in `conv_kinds`; the other conv kinds are not
    handled in this function.

    `data_type` is unpacked as (element_a, element_b, element_c,
    element_epilogue) and `layout` is indexed as (A, B, C).

    Returns the list of operations that were appended to the manifest.
    """
    element_a, element_b, element_c, element_epilogue = data_type

    # Without a kernel filter, restrict generation to the first tile
    # description and the first alignment constraint.
    if manifest.kernel_filter == '':
        tile_descriptions = [tile_descriptions[0]]
        alignment_constraints = [alignment_constraints[0]]

    # A stride/dilation of [-1, -1] marks a tile that is not specialized on it.
    unspecified = [-1, -1]
    candidate_algorithms = (IteratorAlgorithm.FixedStrideDilation, IteratorAlgorithm.Optimized)

    created = []

    for tile in tile_descriptions:
        for alignment in alignment_constraints:

            # Operand C uses an alignment capped at 8.
            A = TensorDescription(element_a, layout[0], alignment)
            B = TensorDescription(element_b, layout[1], alignment)
            C = TensorDescription(element_c, layout[2], min(8, alignment))

            if ConvKind.Fprop not in conv_kinds:
                continue

            for iterator_algorithm in candidate_algorithms:
                stride_support = StrideSupport.Strided

                if iterator_algorithm == IteratorAlgorithm.FixedStrideDilation:
                    # Needs both stride and dilation pinned to concrete values.
                    if tile.stride == unspecified or tile.dilation == unspecified:
                        continue
                    stride_support = StrideSupport.Fixed
                elif iterator_algorithm == IteratorAlgorithm.Optimized:
                    # Only taken when neither stride nor dilation is pinned.
                    if not (tile.stride == unspecified and tile.dilation == unspecified):
                        continue

                depthwise_op = Conv2dOperation(
                    ConvKind.Fprop,
                    iterator_algorithm,
                    tile.minimum_compute_capability,
                    tile,
                    A, B, C,
                    element_epilogue,
                    stride_support,
                    epilogue_functor,
                    swizzling_functor,
                    group_mode=GroupMode.Depthwise)

                manifest.append(depthwise_op)
                created.append(depthwise_op)

    return created
###################################################################################################
###################################################################################################
@ -748,10 +823,83 @@ def GenerateSM60_Simt(manifest, cuda_version):
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
def GenerateSM60_Simt_DepthwiseConv2d(manifest, cuda_version):
    """Build the f16 SIMT depthwise Conv2d tile set (min_cc 60) and emit it.

    Tile descriptions are generated for every (stride, dilation) combination,
    for 3x3 and 5x5 filters, and passed to `CreateDepthwiseConv2dOperator`
    with NHWC layouts for all three tensors. `cuda_version` is not consulted.
    """
    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f16, DataType.f16, DataType.f16,
            OpcodeClass.Simt,
            MathOperation.multiply_add),
    ]

    min_cc, max_cc = 60, 1024
    alignment_constraints = [8]

    # Filter shapes, in the order their tiles are emitted.
    filter_shapes = ([3, 3], [5, 5])

    # [stride_h, stride_w]; [-1, -1] means all stride sizes.
    strides = [[-1, -1], [1, 1], [2, 2]]
    # [dilation_h, dilation_w]; [-1, -1] means all dilation sizes.
    dilations = [[-1, -1], [1, 1], [2, 2]]

    # (output shape per thread block [N, P, Q], groups per thread block, stages)
    output_tiles = (
        ([1, 8, 8], 32, 3),
        ([1, 8, 8], 64, 3),
        ([1, 8, 8], 16, 3),
        ([1, 10, 10], 64, 2),
        ([1, 4, 4], 32, 4),
        ([1, 4, 4], 64, 4),
        ([1, 4, 4], 16, 4),
    )

    tile_descriptions = []
    for math_inst in math_instructions:
        for stride, dilation in product(strides, dilations):
            for filter_shape in filter_shapes:
                # Arguments: thread-block output (+ groups), filter, stages,
                # stride, dilation, warp count, math instruction, cc range.
                tile_descriptions.extend(
                    Direct2dConvFixedStrideDilationTileDescription(
                        npq + [groups], filter_shape, stages, stride, dilation,
                        [4, 1, 1], math_inst, min_cc, max_cc)
                    for npq, groups, stages in output_tiles)

        # Epilogue element type is the accumulator type as well.
        accumulator = math_inst.element_accumulator
        data_type = [math_inst.element_a, math_inst.element_b, accumulator, accumulator]

        conv_layout = (LayoutType.TensorNHWC,) * 3
        CreateDepthwiseConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
#
def GenerateSM60(manifest, cuda_version):
    """Run every SM60 generator against `manifest`, in a fixed order."""
    sm60_generators = (
        GenerateSM60_Simt,
        GenerateSM60_Simt_DepthwiseConv2d,
    )
    for generate in sm60_generators:
        generate(manifest, cuda_version)
###################################################################################################
###################################################################################################
@ -3813,6 +3961,627 @@ def GenerateSM80(manifest, cuda_version):
GenerateSM80_Simt_complex(manifest, cuda_version)
###################################################################################################
#
def GenerateSM90_TensorOp_1684(manifest, cuda_version):
    """Emit f64 tensor-op (16x8x4) GEMM operations with min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # All four A/B layout combinations; C is always column-major.
    operand_layouts = (LayoutType.ColumnMajor, LayoutType.RowMajor)
    layouts = [
        (layout_a, layout_b, LayoutType.ColumnMajor)
        for layout_a in operand_layouts
        for layout_b in operand_layouts
    ]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 128, 16], 3, [4, 2, 1]),
        ([256, 64, 16], 3, [4, 2, 1]),
        ([64, 256, 16], 3, [2, 4, 1]),
        ([256, 32, 16], 3, [4, 1, 1]),
        ([32, 256, 16], 3, [1, 4, 1]),
        ([128, 64, 16], 3, [2, 2, 1]),
        ([64, 128, 16], 3, [2, 2, 1]),
        ([64, 64, 16], 4, [2, 2, 1]),
        ([64, 32, 16], 4, [2, 2, 1]),
        ([32, 64, 16], 4, [2, 2, 1]),
        ([32, 32, 16], 5, [2, 2, 1]),
        ([16, 32, 16], 5, [1, 2, 1]),
        ([32, 16, 16], 5, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.f64] * 4

    CreateGemmOperator(manifest, layouts, tile_descriptions,
                       data_type, alignment_constraints)
#
#
def GenerateSM90_TensorOp_1684_complex(manifest, cuda_version):
    """Emit cf64 tensor-op GEMM operations (multiply_add_complex), min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # All four A/B layout combinations; C is always column-major.
    operand_layouts = (LayoutType.ColumnMajor, LayoutType.RowMajor)
    layouts = [
        (layout_a, layout_b, LayoutType.ColumnMajor)
        for layout_a in operand_layouts
        for layout_b in operand_layouts
    ]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 64, 8], 3, [4, 2, 1]),
        ([64, 128, 8], 3, [2, 4, 1]),
        ([64, 64, 8], 3, [2, 2, 1]),
        ([64, 32, 8], 4, [2, 2, 1]),
        ([32, 64, 8], 4, [2, 2, 1]),
        ([32, 32, 8], 4, [2, 2, 1]),
        ([16, 32, 8], 4, [1, 2, 1]),
        ([32, 16, 8], 4, [2, 1, 1]),
        ([128, 64, 16], 3, [4, 2, 1]),
        ([64, 128, 16], 3, [2, 4, 1]),
        ([64, 64, 16], 3, [2, 2, 1]),
        ([64, 32, 16], 3, [2, 2, 1]),
        ([32, 64, 16], 3, [2, 2, 1]),
        ([32, 32, 16], 4, [2, 2, 1]),
        ([16, 32, 16], 4, [1, 2, 1]),
        ([32, 16, 16], 3, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.cf64] * 4

    # Every (first, second) pairing of none/conj; the first entry varies fastest.
    transform_choices = (ComplexTransform.none, ComplexTransform.conj)
    complex_transforms = [
        (first, second)
        for second in transform_choices
        for first in transform_choices
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions,
                       data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version):
    """Emit cf64 tensor-op GEMM operations (multiply_add_complex_gaussian), min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # All four A/B layout combinations; C is always column-major.
    operand_layouts = (LayoutType.ColumnMajor, LayoutType.RowMajor)
    layouts = [
        (layout_a, layout_b, LayoutType.ColumnMajor)
        for layout_a in operand_layouts
        for layout_b in operand_layouts
    ]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex_gaussian)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([64, 64, 8], 3, [4, 2, 1]),
        ([64, 32, 8], 4, [2, 2, 1]),
        ([32, 64, 8], 4, [2, 2, 1]),
        ([32, 32, 8], 4, [2, 2, 1]),
        ([16, 32, 8], 4, [1, 2, 1]),
        ([32, 16, 8], 4, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.cf64] * 4

    # Every (first, second) pairing of none/conj; the first entry varies fastest.
    transform_choices = (ComplexTransform.none, ComplexTransform.conj)
    complex_transforms = [
        (first, second)
        for second in transform_choices
        for first in transform_choices
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions,
                       data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_rank_k(manifest, cuda_version):
    """Emit f64 tensor-op rank-k update operations (symmetric), min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # Two-entry layout tuples; the second entry is always column-major.
    layouts = [
        (first_layout, LayoutType.ColumnMajor)
        for first_layout in (LayoutType.ColumnMajor, LayoutType.RowMajor)
    ]
    fill_modes = [FillMode.Lower, FillMode.Upper]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 128, 16], 3, [4, 2, 1]),
        ([64, 128, 16], 3, [2, 2, 1]),
        ([128, 64, 16], 3, [2, 2, 1]),
        ([64, 64, 16], 4, [2, 2, 1]),
        ([64, 32, 16], 4, [2, 2, 1]),
        ([32, 64, 16], 4, [2, 2, 1]),
        ([32, 32, 16], 5, [2, 2, 1]),
        ([16, 32, 16], 5, [1, 2, 1]),
        ([32, 16, 16], 5, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.f64] * 3

    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
                        data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM90_TensorOp_1684_rank_k_complex(manifest, cuda_version):
    """Emit cf64 tensor-op rank-k operations (multiply_add_complex), min_cc 90.

    The same tile set is emitted twice: once with `BlasMode.symmetric` (SYRK)
    and once with `BlasMode.hermitian` (HERK). Returns without emitting
    anything when `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # Two-entry layout tuples; the second entry is always column-major.
    layouts = [
        (first_layout, LayoutType.ColumnMajor)
        for first_layout in (LayoutType.ColumnMajor, LayoutType.RowMajor)
    ]
    fill_modes = [FillMode.Lower, FillMode.Upper]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 64, 8], 3, [4, 2, 1]),
        ([64, 128, 8], 3, [2, 4, 1]),
        ([64, 64, 8], 3, [2, 2, 1]),
        # Tiles left disabled, as in the original list:
        # ([64, 32, 8], 4, [2, 2, 1]),
        # ([32, 64, 8], 4, [2, 2, 1]),
        # ([32, 32, 8], 4, [2, 2, 1]),
        # ([16, 32, 8], 4, [1, 2, 1]),
        # ([32, 16, 8], 4, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.cf64] * 3

    # SYRK computation first, then HERK computation.
    for blas_mode in (BlasMode.symmetric, BlasMode.hermitian):
        CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
                            data_type, alignment_constraints, blas_mode)
#
#
def GenerateSM90_TensorOp_1684_rank_k_complex_gaussian(manifest, cuda_version):
    """Emit cf64 tensor-op rank-k operations (multiply_add_complex_gaussian), min_cc 90.

    The same tile set is emitted twice: once with `BlasMode.symmetric` (SYRK)
    and once with `BlasMode.hermitian` (HERK). Returns without emitting
    anything when `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    layouts = [
        (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
        (LayoutType.RowMajor, LayoutType.ColumnMajor),
    ]

    fill_modes = [
        FillMode.Lower, FillMode.Upper,
    ]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex_gaussian)

    min_cc = 90
    max_cc = 1024

    alignment_constraints = [1,]

    tile_descriptions = [
        TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
        TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
        TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
        #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
        #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
        #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.cf64, DataType.cf64, DataType.cf64]

    # The original also assigned a local `complex_transforms` list here that
    # was never passed to CreateRankKOperator; it has been dropped.

    # SYRK computation
    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
        data_type, alignment_constraints, BlasMode.symmetric)

    # HERK computation
    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
        data_type, alignment_constraints, BlasMode.hermitian)
#
#
def GenerateSM90_TensorOp_1684_trmm(manifest, cuda_version):
    """Emit f64 tensor-op TRMM operations, min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # Three-entry layout tuples; only the first entry varies.
    layouts = [
        (first_layout, LayoutType.ColumnMajor, LayoutType.ColumnMajor)
        for first_layout in (LayoutType.ColumnMajor, LayoutType.RowMajor)
    ]
    side_modes = [SideMode.Left, SideMode.Right]
    fill_modes = [FillMode.Lower, FillMode.Upper]
    diag_types = [DiagType.NonUnit, DiagType.Unit]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 128, 16], 3, [4, 2, 1]),
        ([64, 128, 16], 3, [2, 2, 1]),
        ([128, 64, 16], 3, [2, 2, 1]),
        ([64, 64, 16], 4, [2, 2, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.f64] * 4

    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions,
                       data_type, alignment_constraints)
#
#
def GenerateSM90_TensorOp_1684_trmm_complex(manifest, cuda_version):
    """Emit cf64 tensor-op TRMM operations (multiply_add_complex), min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # Three-entry layout tuples; only the first entry varies.
    layouts = [
        (first_layout, LayoutType.ColumnMajor, LayoutType.ColumnMajor)
        for first_layout in (LayoutType.ColumnMajor, LayoutType.RowMajor)
    ]
    side_modes = [SideMode.Left, SideMode.Right]
    fill_modes = [FillMode.Lower, FillMode.Upper]
    diag_types = [DiagType.NonUnit, DiagType.Unit]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 64, 8], 3, [4, 2, 1]),
        ([64, 128, 8], 3, [2, 4, 1]),
        ([64, 64, 8], 3, [2, 2, 1]),
        ([64, 32, 8], 4, [2, 2, 1]),
        ([32, 64, 8], 4, [2, 2, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.cf64] * 4
    complex_transforms = [ComplexTransform.none, ComplexTransform.conj]

    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions,
                       data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_trmm_complex_gaussian(manifest, cuda_version):
    """Emit cf64 tensor-op TRMM operations (multiply_add_complex_gaussian), min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    # Three-entry layout tuples; only the first entry varies.
    layouts = [
        (first_layout, LayoutType.ColumnMajor, LayoutType.ColumnMajor)
        for first_layout in (LayoutType.ColumnMajor, LayoutType.RowMajor)
    ]
    side_modes = [SideMode.Left, SideMode.Right]
    fill_modes = [FillMode.Lower, FillMode.Upper]
    diag_types = [DiagType.NonUnit, DiagType.Unit]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex_gaussian)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([64, 64, 8], 3, [4, 2, 1]),
        ([64, 32, 8], 4, [2, 2, 1]),
        ([32, 64, 8], 4, [2, 2, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.cf64] * 4
    complex_transforms = [ComplexTransform.none, ComplexTransform.conj]

    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions,
                       data_type, alignment_constraints, complex_transforms)
#
#
def GenerateSM90_TensorOp_1684_symm(manifest, cuda_version):
    """Emit f64 tensor-op SYMM operations (symmetric), min_cc 90.

    Returns without emitting anything when
    `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    layouts = [(LayoutType.ColumnMajor, LayoutType.ColumnMajor)]
    side_modes = [SideMode.Left, SideMode.Right]
    fill_modes = [FillMode.Lower, FillMode.Upper]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 128, 16], 3, [4, 2, 1]),
        ([64, 128, 16], 3, [2, 2, 1]),
        ([128, 64, 16], 3, [2, 2, 1]),
        ([64, 64, 16], 4, [2, 2, 1]),
        ([64, 32, 16], 4, [2, 2, 1]),
        ([32, 64, 16], 4, [2, 2, 1]),
        ([32, 32, 16], 5, [2, 2, 1]),
        ([16, 32, 16], 5, [1, 2, 1]),
        ([32, 16, 16], 5, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.f64] * 4

    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
                       data_type, alignment_constraints, BlasMode.symmetric)
#
#
def GenerateSM90_TensorOp_1684_symm_complex(manifest, cuda_version):
    """Emit cf64 tensor-op SYMM/HEMM operations (multiply_add_complex), min_cc 90.

    The same tile set is emitted twice: once with `BlasMode.symmetric` (SYMM)
    and once with `BlasMode.hermitian` (HEMM). Returns without emitting
    anything when `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    layouts = [(LayoutType.ColumnMajor, LayoutType.ColumnMajor)]
    side_modes = [SideMode.Left, SideMode.Right]
    fill_modes = [FillMode.Lower, FillMode.Upper]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex)

    min_cc, max_cc = 90, 1024
    alignment_constraints = [1]

    # (threadblock shape, stages, warp count)
    tile_configs = (
        ([128, 64, 8], 3, [4, 2, 1]),
        ([64, 128, 8], 3, [2, 4, 1]),
        ([64, 64, 8], 3, [2, 2, 1]),
        # Tiles left disabled, as in the original list:
        # ([64, 32, 8], 4, [2, 2, 1]),
        # ([32, 64, 8], 4, [2, 2, 1]),
        # ([32, 32, 8], 4, [2, 2, 1]),
        # ([16, 32, 8], 4, [1, 2, 1]),
        # ([32, 16, 8], 4, [2, 1, 1]),
    )
    tile_descriptions = [
        TileDescription(threadblock, stages, warps, math_inst, min_cc, max_cc)
        for threadblock, stages, warps in tile_configs
    ]

    data_type = [DataType.cf64] * 4

    # SYMM computation first, then HEMM computation.
    for blas_mode in (BlasMode.symmetric, BlasMode.hermitian):
        CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
                           data_type, alignment_constraints, blas_mode)
#
#
def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version):
    """Emit cf64 tensor-op SYMM/HEMM operations (multiply_add_complex_gaussian), min_cc 90.

    The same tile set is emitted twice: once with `BlasMode.symmetric` (SYMM)
    and once with `BlasMode.hermitian` (HEMM). Returns without emitting
    anything when `CudaToolkitVersionSatisfies(cuda_version, 11, 8)` is false.
    """
    if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
        return

    layouts = [
        (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    ]

    side_modes = [
        SideMode.Left, SideMode.Right,
    ]

    fill_modes = [
        FillMode.Lower, FillMode.Upper,
    ]

    math_inst = MathInstruction(
        [16, 8, 4],
        DataType.f64, DataType.f64, DataType.f64,
        OpcodeClass.TensorOp,
        MathOperation.multiply_add_complex_gaussian)

    min_cc = 90
    max_cc = 1024

    alignment_constraints = [1,]

    tile_descriptions = [
        TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
        TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
        TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
        #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
        #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
        #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

    # The original also assigned a local `complex_transforms` list here that
    # was never passed to CreateSymmOperator; it has been dropped.

    # SYMM computation
    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
        data_type, alignment_constraints, BlasMode.symmetric)

    # HEMM computation
    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
        data_type, alignment_constraints, BlasMode.hermitian)
#
###################################################################################################
#
def GenerateSM90(manifest, cuda_version):
    """Run every SM90 tensor-op 16x8x4 generator against `manifest`, in a fixed order."""
    sm90_generators = (
        # GEMM
        GenerateSM90_TensorOp_1684,
        GenerateSM90_TensorOp_1684_complex,
        GenerateSM90_TensorOp_1684_complex_gaussian,
        # Rank-k update
        GenerateSM90_TensorOp_1684_rank_k,
        GenerateSM90_TensorOp_1684_rank_k_complex,
        GenerateSM90_TensorOp_1684_rank_k_complex_gaussian,
        # TRMM
        GenerateSM90_TensorOp_1684_trmm,
        GenerateSM90_TensorOp_1684_trmm_complex,
        GenerateSM90_TensorOp_1684_trmm_complex_gaussian,
        # SYMM
        GenerateSM90_TensorOp_1684_symm,
        GenerateSM90_TensorOp_1684_symm_complex,
        GenerateSM90_TensorOp_1684_symm_complex_gaussian,
    )
    for generate in sm90_generators:
        generate(manifest, cuda_version)
###################################################################################################
if __name__ == "__main__":
@ -3842,6 +4611,8 @@ if __name__ == "__main__":
GenerateSM70(manifest, args.cuda_version)
GenerateSM75(manifest, args.cuda_version)
GenerateSM80(manifest, args.cuda_version)
GenerateSM90(manifest, args.cuda_version)
if 'library' in args.generator_target.split(','):
manifest.emit(GeneratorTarget.Library)