Support for Mixed Input TensorOp (#1084)

* Passing warp-level mixed input F16*(S8/U8) tests

* passing device-level mixed input F16*(S8/U8) tests

* add to profiler - I8 (111 TFLOPs), U (123 TFLOPs)

* fast numeric conversions (I8 = 132 TFLOPs, U8 = 148 TFLOPs)

* Speedup reference compilation (REVERT THIS COMMIT)

* wider_add.u32_packed_sub.f16x2 (I8 = 132TFLOP/s, U8 = 170 TFLOP/s)

* Improve s8->f16 cvt and support bf16*u8 @158 TFLOPs

* BF16 * S8 (142 TFLOPs)

* Handle mixed-input upcast on OperandA (Support [S8|U8]*[F16|BF16]

* rename OpMultiplyAddMixedInput to OpMultiplyAddMixedInputUpcast

* Add device-level test and profiler support for upcast on operand A

* Move shfl before the cvt and reduce #shfls by 1/2

* fix smem_usage calculation for mixed_input types

* uncomment the stuff (getting ready for merge)

* profiler changes and mixed-input reference

* mixed input reference are in a new file

* use platform instead of std

* comments and typo only

* Use CreateGemmOperator and delete CreateMixedInputGemmOperator

* copyright for new files

* rebase follow-up
This commit is contained in:
Manish Gupta
2023-09-27 08:18:30 -07:00
committed by GitHub
parent 5cd735c48e
commit 7d8317a63e
26 changed files with 2064 additions and 13 deletions

View File

@ -62,6 +62,11 @@ class Conv2dOperation:
self.stride_support = stride_support
self.swizzling_functor = swizzling_functor
self.group_mode = group_mode
#
def is_mixed_input(self):
return self.A.element != self.B.element
#
def is_complex(self):
complex_operators = [

View File

@ -60,7 +60,11 @@ class Conv3dOperation:
self.iterator_algorithm = iterator_algorithm
self.stride_support = stride_support
self.swizzling_functor = swizzling_functor
#
def is_mixed_input(self):
return self.A.element != self.B.element
#
def core_name(self):
''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''

View File

@ -88,6 +88,10 @@ class GemmOperation:
]
return self.tile_description.math_instruction.math_operation in complex_operators
#
def is_mixed_input(self):
return self.A.element != self.B.element
#
def is_planar_complex(self):
return self.gemm_kind in (GemmKind.PlanarComplex, GemmKind.PlanarComplexArray)
@ -149,14 +153,19 @@ class GemmOperation:
if self.C.element != self.tile_description.math_instruction.element_accumulator and \
self.A.element != self.tile_description.math_instruction.element_accumulator:
extended_name = "${element_c}_${core_name}_${element_a}"
if self.is_mixed_input():
extended_name += "_${element_b}"
elif self.C.element == self.tile_description.math_instruction.element_accumulator and \
self.A.element != self.tile_description.math_instruction.element_accumulator:
extended_name = "${core_name}_${element_a}"
if self.is_mixed_input():
extended_name += "_${element_b}"
else:
extended_name = "${core_name}"
extended_name = SubstituteTemplate(extended_name, {
'element_a': DataTypeNames[self.A.element],
'element_b': DataTypeNames[self.B.element],
'element_c': DataTypeNames[self.C.element],
'core_name': self.core_name()
})
@ -235,7 +244,7 @@ class GemmOperation:
ex = self.extended_name(),
tb = threadblock,
l = self.layout_name(),
a = str(self.A.alignment))
a = str(max(self.A.alignment, self.B.alignment)))
#
def configuration_name(self):

View File

@ -103,11 +103,14 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
# If alignment is a tuple or a list, then we have different alignments for A and B
alignment_a = alignment if isinstance(alignment, int) else alignment[0]
alignment_b = alignment if isinstance(alignment, int) else alignment[1]
alignment_c = min(8, alignment_a)
alignment_c = min(8, alignment)
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
A = TensorDescription(element_a, layout[0], alignment_a, complex_transform[0])
B = TensorDescription(element_b, layout[1], alignment_b, complex_transform[1])
C = TensorDescription(element_c, layout[2], alignment_c)
new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \
@ -2150,6 +2153,116 @@ def GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version):
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
data_type_mixed, alignment_constraints, complex_transforms)
#
def GenerateSM80_MixedInputTensorOp_16816(manifest, cuda_version):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
layouts = [
(LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
]
# Upcast on Operand A
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.s8, DataType.f16, DataType.f16, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.s8, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.u8, DataType.f16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.u8, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.s8, DataType.bf16, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
]
min_cc = 80
max_cc = 1024
# For mixed-input alignment constraints are a list of lists, where the inner list
# contains the alignment constraints for [operandA, operandB].
alignment_constraints = [[16, 8],]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_b,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
# Upcast on Operand B
math_instructions = [
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.s8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.bf16, DataType.s8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.f16, DataType.u8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
MathInstruction( \
[16, 8, 16], \
DataType.bf16, DataType.u8, DataType.f32, \
OpcodeClass.TensorOp, \
MathOperation.multiply_add_mixed_input_upcast),
]
min_cc = 80
max_cc = 1024
# For mixed-input alignment constraints are a list of lists, where the inner list
# contains the alignment constraints for [operandA, operandB].
alignment_constraints = [[8, 16],]
for math_inst in math_instructions:
tile_descriptions = [
TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
]
data_type = [
math_inst.element_a,
math_inst.element_b,
math_inst.element_a,
math_inst.element_accumulator,
]
CreateGemmOperator(manifest, layouts, tile_descriptions, \
data_type, alignment_constraints)
#
def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
@ -4083,6 +4196,7 @@ def GenerateSM80(manifest, cuda_version):
GenerateSM80_TensorOp_884_symm(manifest, cuda_version)
GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version)
GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version)
GenerateSM80_MixedInputTensorOp_16816(manifest, cuda_version)
GenerateSM80_TensorOp_16832_TN(manifest, cuda_version)
GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version)
GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version)

View File

@ -289,6 +289,7 @@ class ComplexMultiplyOp(enum.Enum):
class MathOperation(enum.Enum):
multiply_add = enum_auto()
multiply_add_saturate = enum_auto()
multiply_add_mixed_input_upcast = enum_auto()
xor_popc = enum_auto()
and_popc = enum_auto()
multiply_add_fast_bf16 = enum_auto()
@ -302,6 +303,7 @@ class MathOperation(enum.Enum):
MathOperationTag = {
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
MathOperation.multiply_add_mixed_input_upcast: 'cutlass::arch::OpMultiplyAddMixedInputUpcast',
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
MathOperation.and_popc: 'cutlass::arch::OpAndPopc',
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
@ -964,8 +966,13 @@ def CalculateSmemUsage(operation):
cta_shape[0] * (cta_shape[2] // 2) // elements_per_8b_md
else:
# Few BLAS3 operations only have A tensor
smem_per_stage = DataTypeSize[operation.A.element] * cta_shape[0] * cta_shape[2] // 8 + \
DataTypeSize[operation.A.element] * cta_shape[1] * cta_shape[2] // 8
data_type_size_a = DataTypeSize[operation.A.element]
data_type_size_b = DataTypeSize[operation.A.element]
if operation.is_mixed_input():
data_type_size_b = DataTypeSize[operation.B.element]
smem_per_stage = data_type_size_a * cta_shape[0] * cta_shape[2] // 8 + \
data_type_size_b * cta_shape[1] * cta_shape[2] // 8
smem_usage = smem_per_stage * stages
return (smem_usage >> 10)

View File

@ -79,6 +79,10 @@ class Rank2KOperation:
return self.tile_description.math_instruction.math_operation in complex_operators
return False
#
def is_mixed_input(self):
return self.A.element != self.B.element
#
def is_planar_complex(self):
return False

View File

@ -77,6 +77,10 @@ class RankKOperation:
return self.tile_description.math_instruction.math_operation in complex_operators
return False
#
def is_mixed_input(self):
return False
#
def is_planar_complex(self):
return False

View File

@ -79,6 +79,10 @@ class SymmOperation:
return self.tile_description.math_instruction.math_operation in complex_operators
return False
#
def is_mixed_input(self):
return self.A.element != self.B.element
#
def is_planar_complex(self):
return False

View File

@ -81,6 +81,10 @@ class TrmmOperation:
# return self.trmm_kind in (TrmmKind.PlanarComplex, TrmmKind.PlanarComplexArray)
return False
#
def is_mixed_input(self):
return self.A.element != self.B.element
#
def accumulator_type(self):
accum = self.tile_description.math_instruction.element_accumulator