Support for Mixed Input TensorOp (#1084)
* Passing warp-level mixed input F16*(S8/U8) tests
* Passing device-level mixed input F16*(S8/U8) tests
* Add to profiler - I8 (111 TFLOPs), U8 (123 TFLOPs)
* Fast numeric conversions (I8 = 132 TFLOPs, U8 = 148 TFLOPs)
* Speedup reference compilation (REVERT THIS COMMIT)
* wider_add.u32_packed_sub.f16x2 (I8 = 132 TFLOPs, U8 = 170 TFLOPs)
* Improve s8->f16 cvt and support bf16*u8 @158 TFLOPs
* BF16 * S8 (142 TFLOPs)
* Handle mixed-input upcast on OperandA (Support [S8|U8]*[F16|BF16])
* Rename OpMultiplyAddMixedInput to OpMultiplyAddMixedInputUpcast
* Add device-level test and profiler support for upcast on operand A
* Move shfl before the cvt and reduce #shfls by 1/2
* Fix smem_usage calculation for mixed_input types
* Uncomment the stuff (getting ready for merge)
* Profiler changes and mixed-input reference
* Mixed-input references are in a new file
* Use platform instead of std
* Comments and typo only
* Use CreateGemmOperator and delete CreateMixedInputGemmOperator
* Copyright for new files
* Rebase follow-up
This commit is contained in:
@ -103,11 +103,14 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
|
||||
for tile_description in tile_descriptions:
|
||||
for alignment in alignment_constraints:
|
||||
for complex_transform in complex_transforms:
|
||||
|
||||
# If alignment is a tuple or a list, then we have different alignments for A and B
|
||||
alignment_a = alignment if isinstance(alignment, int) else alignment[0]
|
||||
alignment_b = alignment if isinstance(alignment, int) else alignment[1]
|
||||
alignment_c = min(8, alignment_a)
|
||||
|
||||
alignment_c = min(8, alignment)
|
||||
|
||||
A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
|
||||
B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
|
||||
A = TensorDescription(element_a, layout[0], alignment_a, complex_transform[0])
|
||||
B = TensorDescription(element_b, layout[1], alignment_b, complex_transform[1])
|
||||
C = TensorDescription(element_c, layout[2], alignment_c)
|
||||
|
||||
new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \
|
||||
@ -2150,6 +2153,116 @@ def GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version):
|
||||
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
|
||||
data_type_mixed, alignment_constraints, complex_transforms)
|
||||
|
||||
|
||||
#
|
||||
def GenerateSM80_MixedInputTensorOp_16816(manifest, cuda_version):
  """Add the SM80 mixed-input [16, 8, 16] TensorOp GEMM operations to `manifest`.

  Two groups of math instructions are emitted through CreateGemmOperator, in
  this order:

    1. Upcast on operand A: A is s8/u8 and B is f16/bf16.
    2. Upcast on operand B: A is f16/bf16 and B is s8/u8.

  Every instruction is paired with two [128, 128, 64] tile descriptions
  (4 stages, then 3 stages) using a [2, 2, 1] warp count.

  Args:
    manifest: forwarded unchanged to CreateGemmOperator.
    cuda_version: checked with CudaToolkitVersionSatisfies(cuda_version, 11, 0);
      when that check fails the function returns without emitting anything.
  """
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  # A single layout combination, shared by every CreateGemmOperator call below.
  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  min_cc = 80
  max_cc = 1024

  def upcast_mma(element_a, element_b, element_accumulator):
    # All instructions here share shape, opcode class and math operation;
    # only the element types differ.
    return MathInstruction(
      [16, 8, 16],
      element_a, element_b, element_accumulator,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_mixed_input_upcast)

  def emit_group(instructions, alignment_constraints, third_from_b):
    # One CreateGemmOperator call per instruction, in list order.
    for inst in instructions:
      tiles = [
        TileDescription([128, 128, 64], stages, [2, 2, 1], inst, min_cc, max_cc)
        for stages in (4, 3)
      ]

      # The third entry repeats whichever operand is NOT the 8-bit integer one
      # (presumably this is the C/output element type -- confirm against how
      # CreateGemmOperator unpacks data_type).
      data_type = [
        inst.element_a,
        inst.element_b,
        inst.element_b if third_from_b else inst.element_a,
        inst.element_accumulator,
      ]

      CreateGemmOperator(manifest, layouts, tiles, data_type, alignment_constraints)

  # For mixed-input, alignment constraints are a list of lists, where each
  # inner list holds the alignment constraints for [operandA, operandB].

  # Upcast on Operand A
  emit_group(
    [
      upcast_mma(DataType.s8, DataType.f16, DataType.f16),
      upcast_mma(DataType.s8, DataType.f16, DataType.f32),
      upcast_mma(DataType.u8, DataType.f16, DataType.f32),
      upcast_mma(DataType.u8, DataType.bf16, DataType.f32),
      upcast_mma(DataType.s8, DataType.bf16, DataType.f32),
    ],
    [[16, 8]],
    True)

  # Upcast on Operand B
  emit_group(
    [
      upcast_mma(DataType.f16, DataType.s8, DataType.f32),
      upcast_mma(DataType.bf16, DataType.s8, DataType.f32),
      upcast_mma(DataType.f16, DataType.u8, DataType.f32),
      upcast_mma(DataType.bf16, DataType.u8, DataType.f32),
    ],
    [[8, 16]],
    False)
|
||||
|
||||
#
|
||||
def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
|
||||
|
||||
@ -4083,6 +4196,7 @@ def GenerateSM80(manifest, cuda_version):
|
||||
GenerateSM80_TensorOp_884_symm(manifest, cuda_version)
|
||||
GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version)
|
||||
GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version)
|
||||
GenerateSM80_MixedInputTensorOp_16816(manifest, cuda_version)
|
||||
GenerateSM80_TensorOp_16832_TN(manifest, cuda_version)
|
||||
GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version)
|
||||
GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version)
|
||||
|
||||
Reference in New Issue
Block a user