v4.2 release. (#2587)
* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set on the command line.
* v4.2 release.
This commit is contained in:
@ -90,10 +90,12 @@ try:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
from cutlass_library.manifest import *
|
||||
from cutlass_library.heuristics import *
|
||||
from cutlass_library.emit_kernel_listing import emit_gemm_kernel_testlist
|
||||
except ImportError:
|
||||
from library import *
|
||||
from manifest import *
|
||||
from heuristics import *
|
||||
from emit_kernel_listing import emit_gemm_kernel_testlist
|
||||
###################################################################################################
|
||||
|
||||
@ -112,6 +114,10 @@ def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
|
||||
cuda_version.append(x)
|
||||
return cuda_version >= [major, minor, patch]
|
||||
|
||||
# From cuda 13.0, Thor SM is renumbered from 101 to 110
|
||||
def ThorSMRenumbering(cuda_version):
    """Return the SM number to use for Thor under the given CUDA toolkit version.

    Thor is numbered 110 when the toolkit version satisfies 13.0, and 101
    for any older toolkit.

    cuda_version is the semantic version string of the CUDA Toolkit, in the
    form accepted by CudaToolkitVersionSatisfies.
    """
    if CudaToolkitVersionSatisfies(cuda_version, 13, 0):
        return 110
    return 101
|
||||
|
||||
###################################################################################################
|
||||
###################################################################################################
|
||||
|
||||
@ -6768,9 +6774,11 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version):
|
||||
},
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
math_instructions_1sm = [
|
||||
# tf32 -> f32
|
||||
MathInstruction(
|
||||
@ -6887,7 +6895,8 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK
|
||||
[[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.RowMajor, 0]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
grouped = is_grouped(gemm_kind)
|
||||
@ -7202,9 +7211,11 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK
|
||||
[[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 0]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
epi_type = DataType.f32
|
||||
grouped = is_grouped(gemm_kind)
|
||||
|
||||
@ -7889,9 +7900,11 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version):
|
||||
TileSchedulerType.Default, TileSchedulerType.StreamK
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
epi_type = DataType.f32
|
||||
|
||||
math_instructions_1sm = []
|
||||
@ -8092,6 +8105,8 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 8):
|
||||
return
|
||||
|
||||
grouped = is_grouped(gemm_kind)
|
||||
|
||||
layouts = [
|
||||
[[LayoutType.RowMajor, 128], [LayoutType.ColumnMajor, 128], [LayoutType.RowMajor, 0]],
|
||||
[[LayoutType.ColumnMajor, 128], [LayoutType.RowMajor, 128], [LayoutType.RowMajor, 0]],
|
||||
@ -8120,14 +8135,16 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
def tile_schedulers(sfdtype):
|
||||
# Only use the stream-K scheduler for non-void SFD to limit kernel count. When SFD is void,
|
||||
# the epilogue is the traditional linear combination, for which we already have tests with stream-K.
|
||||
if sfdtype["type"] == DataType.void:
|
||||
if sfdtype["type"] == DataType.void or grouped:
|
||||
return [TileSchedulerType.Default]
|
||||
else:
|
||||
return [TileSchedulerType.Default, TileSchedulerType.StreamK]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
epi_type = DataType.f32
|
||||
|
||||
math_instructions_1sm = []
|
||||
@ -8209,6 +8226,16 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
"c_type" : DataType.void,
|
||||
"d_type" : DataType.bf16,
|
||||
"acc_type" : math_inst.element_accumulator,
|
||||
"epi_type" : epi_type,
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
@ -8246,7 +8273,7 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
|
||||
for data_type in data_types:
|
||||
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
|
||||
[[KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm]]
|
||||
[[to_grouped_schedule(KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100, grouped), to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized1Sm, grouped)]]
|
||||
, tile_schedulers = tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind)
|
||||
|
||||
cluster_shapes_2sm = [
|
||||
@ -8288,6 +8315,16 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
"c_type" : DataType.void,
|
||||
"d_type" : DataType.bf16,
|
||||
"acc_type" : math_inst.element_accumulator,
|
||||
"epi_type" : epi_type,
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
@ -8346,7 +8383,11 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
if tile.threadblock_shape[2] // tile.cluster_shape[2] % layout[1][1] != 0:
|
||||
continue
|
||||
|
||||
if math_inst.instruction_shape[0] == 128:
|
||||
if grouped:
|
||||
CreateGemmUniversal3xOperator(manifest, [layout], [tile], [data_type],
|
||||
[[to_grouped_schedule(KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100, grouped), to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized2Sm, grouped)]]
|
||||
, tile_schedulers = tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind)
|
||||
elif math_inst.instruction_shape[0] == 128:
|
||||
CreateGemmUniversal3xOperator(manifest, [layout], [tile], [data_type],
|
||||
[[KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100, EpilogueScheduleType.TmaWarpSpecialized2Sm]]
|
||||
, tile_schedulers = tile_schedulers(data_type["sfd_type"]), gemm_kind=gemm_kind)
|
||||
@ -8396,9 +8437,11 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio
|
||||
else:
|
||||
return [TileSchedulerType.Default, TileSchedulerType.StreamK]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
epi_type = DataType.f32
|
||||
|
||||
math_instructions_1sm = []
|
||||
@ -8496,6 +8539,16 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
"c_type" : DataType.bf16,
|
||||
"d_type" : DataType.bf16,
|
||||
"acc_type" : math_inst.element_accumulator,
|
||||
"epi_type" : epi_type,
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
@ -8625,6 +8678,16 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
"c_type" : DataType.bf16,
|
||||
"d_type" : DataType.bf16,
|
||||
"acc_type" : math_inst.element_accumulator,
|
||||
"epi_type" : epi_type,
|
||||
"sf_type" : math_inst.element_scale_factor,
|
||||
"sfd_type" : {"type": DataType.void, "vector_size": None, "layout" : None}
|
||||
},
|
||||
{
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
@ -8715,6 +8778,230 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio
|
||||
|
||||
|
||||
|
||||
def _sm103_fp4_ultra_math_instructions(instruction_sizes, ab_types, acc_types):
    """Build block-scaled MMA instructions for every size / A / B / accumulator combination.

    Combinations in which exactly one of A and B is a runtime datatype
    (f4, f6 or f8) are skipped. Every instruction uses the
    BlockScaledTensorOp opcode class, multiply-add, and a UE8M0 scale factor.
    """
    runtime_datatypes = (DataType.f4, DataType.f6, DataType.f8)

    math_instructions = []
    for instr_size, a_type, b_type, acc_type in product(instruction_sizes, ab_types, ab_types, acc_types):
        # A/B datatypes should be both static or both dynamic.
        if (a_type in runtime_datatypes) != (b_type in runtime_datatypes):
            continue

        math_instructions.append(
            MathInstruction(
                instr_size,
                a_type, b_type, acc_type,
                OpcodeClass.BlockScaledTensorOp,
                MathOperation.multiply_add,
                DataType.ue8m0)  # UE8M0 scale factor
        )
    return math_instructions


def _sm103_fp4_ultra_data_types(math_inst, epi_type):
    """Return fresh data-type descriptions for one math instruction.

    Three C/D combinations are produced: void C with f32 D, bf16 C with
    bf16 D, and void C with e2m1 D. Only the last one carries a non-void
    scale-factor-D description (UE8M0, vector size 32). New dicts are created
    on every call because the emission loop writes into "sfd_type".
    """
    def describe(c_type, d_type, sfd_type):
        return {
            "a_type"   : math_inst.element_a,
            "b_type"   : math_inst.element_b,
            "c_type"   : c_type,
            "d_type"   : d_type,
            "acc_type" : math_inst.element_accumulator,
            "epi_type" : epi_type,
            "sf_type"  : math_inst.element_scale_factor,
            "sfd_type" : sfd_type,
        }

    return [
        describe(DataType.void, DataType.f32,
                 {"type": DataType.void, "vector_size": None, "layout" : None}),
        describe(DataType.bf16, DataType.bf16,
                 {"type": DataType.void, "vector_size": None, "layout" : None}),
        describe(DataType.void, DataType.e2m1,
                 {"type": DataType.ue8m0, "vector_size": 32, "layout" : LayoutType.RowMajor}),
    ]


def _sm103_fp4_ultra_emit(manifest, layouts, tile_descriptions, data_types, math_inst, schedules, gemm_kind):
    """Create the GEMM operators for every layout / data-type pair of one math instruction.

    The C/D alignment stored in each layout (layout[2][1]) and the
    scale-factor-D layout of each data type are updated in place before the
    operator is created. `schedules` is a list of
    [kernel schedule, epilogue schedule] pairs.
    """
    for layout in layouts:
        for data_type in data_types:
            # Set alignment d based on Destination format.
            c_bits = DataTypeSize[data_type["c_type"]]
            d_bits = DataTypeSize[data_type["d_type"]]
            if c_bits == 0:
                layout[2][1] = 256 // d_bits
            else:
                layout[2][1] = min(256 // d_bits, 256 // c_bits)

            sfd_type = data_type["sfd_type"]
            if sfd_type["type"] != DataType.void and data_type["d_type"] == DataType.e2m1:
                # For FP4 output, the scale factor layout is the same as the D layout.
                sfd_type["layout"] = layout[2][0]

            # E2M1 x E2M1, vector size 32, E8
            is_fp4 = (math_inst.element_scale_factor == DataType.ue8m0
                      and math_inst.element_a == DataType.e2m1
                      and math_inst.element_b == DataType.e2m1)

            # For FP4 inputs
            if is_fp4:
                # Hand the callee its own copies of the schedule lists on every call.
                CreateGemmUniversal3xOperator(manifest, [layout], tile_descriptions, data_type,
                                              [list(schedule) for schedule in schedules],
                                              gemm_kind=gemm_kind)


def GenerateSM103_TensorOp_fp4_ultra_UMMA_gemm_with_block_scaled(manifest, cuda_version, gemm_kind=GemmKind.BlockScaledUniversal3x):
    """Add the SM103 block-scaled FP4 "ultra" GEMM kernels to `manifest`.

    Both the 1-SM and the 2-SM kernel variants are generated, each with the
    default, prefetch-disabled and TMA-prefetch kernel schedules. All tile
    descriptions use compute capability 103 as both minimum and maximum.

    manifest     -- manifest passed through to CreateGemmUniversal3xOperator.
    cuda_version -- semantic version string of the CUDA Toolkit; nothing is
                    generated unless it satisfies 12.8.
    gemm_kind    -- GEMM kind forwarded to CreateGemmUniversal3xOperator.
    """
    # SM103 MMA with F4 + block scale
    if not CudaToolkitVersionSatisfies(cuda_version, 12, 8):
        return

    # layouts for ABC and their alignments.
    layouts = [
        [[LayoutType.RowMajor, 32], [LayoutType.ColumnMajor, 32], [LayoutType.RowMajor, 0]],
    ]

    instruction_sizes_1sm = [
        [128, 128, 96],
    ]

    instruction_sizes_2sm = [
        [256, 128, 96],
    ]

    ab_types = [
        DataType.e2m1,
    ]

    acc_types = [ DataType.f32 ]  # Accumulator is always 32 bits for block scaled MMA instructions

    min_cc = 103
    max_cc = 103
    epi_type = DataType.f32

    math_instructions_1sm = _sm103_fp4_ultra_math_instructions(instruction_sizes_1sm, ab_types, acc_types)
    math_instructions_2sm = _sm103_fp4_ultra_math_instructions(instruction_sizes_2sm, ab_types, acc_types)

    #
    # 1xSM MMA kernels
    #
    cluster_shapes_1sm = [
        [1,1,1],
        # [1,2,1],
        [2,1,1],
        # [1,4,1],
        [4,4,1],
        DynamicClusterShape,
    ]

    schedules_1sm = [
        [KernelScheduleType.BlockScaledMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103, EpilogueScheduleType.NoSmemWarpSpecialized1Sm],
        [KernelScheduleType.BlockScaledMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch, EpilogueScheduleType.NoSmemWarpSpecialized1Sm],
        [KernelScheduleType.BlockScaledMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch, EpilogueScheduleType.NoSmemWarpSpecialized1Sm],
    ]

    for math_inst in math_instructions_1sm:
        tile_descriptions = []
        for cluster_shape in cluster_shapes_1sm:
            # The cluster shape is used directly as the M/N multiplier; K is fixed at 768.
            tile_descriptions.append(
                TileDescription([
                    math_inst.instruction_shape[0] * cluster_shape[0],
                    math_inst.instruction_shape[1] * cluster_shape[1],
                    768],
                    0, [4, 1, 1], math_inst, min_cc, max_cc, cluster_shape))

        _sm103_fp4_ultra_emit(manifest, layouts, tile_descriptions,
                              _sm103_fp4_ultra_data_types(math_inst, epi_type),
                              math_inst, schedules_1sm, gemm_kind)

    #
    # 2xSM MMA kernels
    #
    cluster_shapes_2sm = [
        [2,1,1],
        # [2,2,1],
        # [2,4,1],
        [4,1,1],
        # [4,2,1],
        [4,4,1],
        DynamicClusterShape,
    ]

    schedules_2sm = [
        [KernelScheduleType.BlockScaledMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103, EpilogueScheduleType.NoSmemWarpSpecialized2Sm],
        [KernelScheduleType.BlockScaledMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch, EpilogueScheduleType.NoSmemWarpSpecialized2Sm],
        [KernelScheduleType.BlockScaledMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch, EpilogueScheduleType.NoSmemWarpSpecialized2Sm],
    ]

    for math_inst in math_instructions_2sm:
        tile_descriptions = []
        for cluster_shape in cluster_shapes_2sm:
            # The M multiplier is half the cluster's first dimension for the 2-SM instruction.
            multiplier_2sm = (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2])
            tile_descriptions.append(
                TileDescription([
                    math_inst.instruction_shape[0] * multiplier_2sm[0],
                    math_inst.instruction_shape[1] * multiplier_2sm[1],
                    math_inst.instruction_shape[2] * 8 * multiplier_2sm[2]],
                    0, [4, 1, 1], math_inst, min_cc, max_cc, cluster_shape))

        _sm103_fp4_ultra_emit(manifest, layouts, tile_descriptions,
                              _sm103_fp4_ultra_data_types(math_inst, epi_type),
                              math_inst, schedules_2sm, gemm_kind)
|
||||
|
||||
def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version):
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 8):
|
||||
@ -8732,7 +9019,8 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version):
|
||||
[[LayoutType.RowMajor, 16], [LayoutType.RowMajor, 16], [LayoutType.RowMajor, 0]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
@ -8948,9 +9236,11 @@ def GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version):
|
||||
[[LayoutType.RowMajor, -1], [LayoutType.ColumnMajor, -1], [LayoutType.RowMajor, -1]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
tile_schedulers = [
|
||||
TileSchedulerType.Default,
|
||||
]
|
||||
@ -9074,9 +9364,11 @@ def GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version):
|
||||
[[LayoutType.RowMajor, -1], [LayoutType.ColumnMajor, -1], [LayoutType.RowMajor, -1]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
tile_schedulers = [
|
||||
TileSchedulerType.Default,
|
||||
]
|
||||
@ -9200,7 +9492,8 @@ def GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version):
|
||||
[[LayoutType.RowMajor, -1], [LayoutType.ColumnMajor, -1], [LayoutType.RowMajor, -1]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
@ -9326,9 +9619,11 @@ def GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version):
|
||||
[[LayoutType.RowMajor, -1], [LayoutType.ColumnMajor, -1], [LayoutType.RowMajor, -1]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
tile_schedulers = [
|
||||
TileSchedulerType.Default,
|
||||
]
|
||||
@ -9465,9 +9760,11 @@ def GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version):
|
||||
[[LayoutType.RowMajor, -1], [LayoutType.ColumnMajor, -1], [LayoutType.RowMajor, -1]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
tile_schedulers = [
|
||||
TileSchedulerType.Default,
|
||||
]
|
||||
@ -9678,9 +9975,11 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version):
|
||||
}
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
math_instructions_1sm = [
|
||||
MathInstruction(
|
||||
[128, 256, 8],
|
||||
@ -9772,9 +10071,11 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version):
|
||||
[[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 0]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
math_instructions_1sm = [
|
||||
MathInstruction(
|
||||
[128, 256, 16],
|
||||
@ -9934,9 +10235,11 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version):
|
||||
[[LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.RowMajor, 0]],
|
||||
]
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
min_cc = 100
|
||||
max_cc = thor_sm
|
||||
|
||||
epi_type = DataType.f32
|
||||
|
||||
math_instructions_1sm = [
|
||||
@ -10084,7 +10387,8 @@ def GenerateSM100_TensorOp_16b_UMMA_conv3x(manifest, cuda_version,
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
|
||||
return
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
minimum_compute_capability = 100
|
||||
maximum_compute_capability = thor_sm
|
||||
|
||||
@ -10238,7 +10542,8 @@ def GenerateSM100_TensorOp_fp8_UMMA_conv3x(manifest, cuda_version,
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
|
||||
return
|
||||
|
||||
thor_sm = 101
|
||||
thor_sm = ThorSMRenumbering(cuda_version)
|
||||
|
||||
minimum_compute_capability = 100
|
||||
maximum_compute_capability = thor_sm
|
||||
|
||||
@ -10422,7 +10727,7 @@ def GenerateSM120_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud
|
||||
return [TileSchedulerType.Default, TileSchedulerType.StreamK]
|
||||
|
||||
min_cc = 120
|
||||
max_cc = 120
|
||||
max_cc = 121
|
||||
|
||||
epi_type = DataType.f32
|
||||
|
||||
@ -10567,7 +10872,7 @@ def GenerateSM120_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio
|
||||
return [TileSchedulerType.Default, TileSchedulerType.StreamK]
|
||||
|
||||
min_cc = 120
|
||||
max_cc = 120
|
||||
max_cc = 121
|
||||
|
||||
epi_type = DataType.f32
|
||||
|
||||
@ -10720,7 +11025,7 @@ def GenerateSM120_Sparse_TensorOp_gemm(manifest, cuda_version):
|
||||
return [TileSchedulerType.Default]
|
||||
|
||||
min_cc = 120
|
||||
max_cc = 120
|
||||
max_cc = 121
|
||||
|
||||
kernel_schedules = [
|
||||
KernelScheduleType.F8f6f4SparseTmaWarpSpecializedCooperativeSm120,
|
||||
@ -10840,7 +11145,7 @@ def GenerateSM120_TensorOp_fp8_UMMA_gemm_with_blockwise(manifest, cuda_version,
|
||||
return [TileSchedulerType.Default]
|
||||
|
||||
min_cc = 120
|
||||
max_cc = 120
|
||||
max_cc = 121
|
||||
|
||||
kernel_schedulers = [
|
||||
KernelScheduleType.BlockwiseTmaWarpSpecializedCooperativeSm120,
|
||||
@ -10924,7 +11229,11 @@ def GenerateSM120_TensorOp_fp8_UMMA_gemm_with_blockwise(manifest, cuda_version,
|
||||
gemm_kind = gemm_kind)
|
||||
|
||||
def GenerateSM100(manifest, cuda_version):
|
||||
arch_family_cc = ['100f', '101f']
|
||||
arch_family_cc = ['100f', '101f', '103a']
|
||||
if CudaToolkitVersionSatisfies(cuda_version, 13, 0):
|
||||
for old_cc, new_cc in [('101f', '110f')]:
|
||||
arch_family_cc = [cc.replace(old_cc, new_cc) for cc in arch_family_cc]
|
||||
|
||||
#
|
||||
# Dense Gemm
|
||||
#
|
||||
@ -10966,8 +11275,11 @@ def GenerateSM100(manifest, cuda_version):
|
||||
# Block Scaled Gemm
|
||||
#
|
||||
GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cuda_version)
|
||||
GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cuda_version, gemm_kind=GemmKind.GroupedBlockScaledUniversal3x)
|
||||
GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_version)
|
||||
GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_version, gemm_kind=GemmKind.GroupedBlockScaledUniversal3x)
|
||||
|
||||
GenerateSM103_TensorOp_fp4_ultra_UMMA_gemm_with_block_scaled(manifest, cuda_version)
|
||||
#
|
||||
# Conv
|
||||
#
|
||||
@ -11413,7 +11725,6 @@ def numeric_log_level(log_level: str) -> int:
|
||||
raise ValueError(f'Invalid log level: {log_level}')
|
||||
return numeric_level
|
||||
|
||||
|
||||
# This function for defining the ArgumentParser is used to make it easy for the CUTLASS Python interface
|
||||
# to leverage the functionality in this file without running this script via a shell prompt.
|
||||
def define_parser():
|
||||
@ -11438,6 +11749,11 @@ def define_parser():
|
||||
parser.add_argument("--filter-by-cc", default='True', type=str, help='If enabled, kernels whose compute capability range is not satisfied by the build target are excluded.')
|
||||
parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit")
|
||||
parser.add_argument('--kernel-filter-file', type=str, default=None, required=False, help='Full path of filter file')
|
||||
parser.add_argument('--heuristics-problems-file', type=str, default=None, required=False, help='Full path of heuristics problem size description file, as a json list')
|
||||
parser.add_argument('--heuristics-testlist-file', type=str, default=None, required=False, help='Full path of heuristics testlist CSV file, to be passed to cutlass_profiler')
|
||||
parser.add_argument('--heuristics-gpu', type=str, default=None, required=False, help='GPU to use for evaluating heuristics offline. None or `auto` to autodetect using cuda', choices=['', 'auto', 'H100_SXM', 'H100_PCIE', 'H100_NVL', 'H200_SXM', 'H20_SXM', 'B200', 'GB200_NVL', 'RTX_5080', 'RTX_5090', 'RTX_PRO_6000'])
|
||||
parser.add_argument('--heuristics-configs-per-problem', type=int, default=10, required=False, help='Number of kernel configs to generate for each problem in the problem list')
|
||||
parser.add_argument('--heuristics-restrict-kernels', action='store_true', help='Restrict heuristics mode to use only the default set of kernels emitted by generator.py')
|
||||
parser.add_argument('--selected-kernel-list', type=str, default=None, required=False,
|
||||
help='Specify the output log file containing all enabled kernels in this build')
|
||||
parser.add_argument("--interface-dir", default=None, required=False, help="Interface header to kernels")
|
||||
@ -11460,6 +11776,9 @@ if __name__ == "__main__":
|
||||
|
||||
archs = args.architectures.split(';')
|
||||
|
||||
if args.heuristics_problems_file:
|
||||
filter_manifest_and_write_heuristics_file(manifest, args)
|
||||
|
||||
GenerateSM50(manifest, args.cuda_version)
|
||||
GenerateSM60(manifest, args.cuda_version)
|
||||
GenerateSM61(manifest, args.cuda_version)
|
||||
@ -11468,17 +11787,20 @@ if __name__ == "__main__":
|
||||
GenerateSM80(manifest, args.cuda_version)
|
||||
GenerateSM89(manifest, args.cuda_version)
|
||||
GenerateSM90(manifest, args.cuda_version)
|
||||
|
||||
|
||||
blackwell_arch_list = [
|
||||
"100a", "100f",
|
||||
"101a", "101f",
|
||||
"120a", "120f"
|
||||
"103a", "103f",
|
||||
"110a", "110f",
|
||||
"120a", "120f",
|
||||
"121a", "121f",
|
||||
]
|
||||
blackwell_enabled_arch = any(arch in blackwell_arch_list for arch in archs)
|
||||
if blackwell_enabled_arch:
|
||||
GenerateSM100(manifest, args.cuda_version)
|
||||
GenerateSM120(manifest, args.cuda_version)
|
||||
|
||||
|
||||
if 'library' in args.generator_target.split(','):
|
||||
manifest.emit(GeneratorTarget.Library)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user