CUTLASS 3.7 (#2045)

* CUTLASS 3.7 * clean up changelog --------- Co-authored-by: yuzhai <yuzhai@nvidia.com> Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
2025-01-18 06:53:07 -08:00
parent 902dff3663
commit b78588d163
2030 changed files with 8947 additions and 3475 deletions
--- a/python/cutlass_library/init.py
+++ b/python/cutlass_library/init.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/conv2d_operation.py
+++ b/python/cutlass_library/conv2d_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/conv3d_operation.py
+++ b/python/cutlass_library/conv3d_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/conv3x_emitter.py
+++ b/python/cutlass_library/conv3x_emitter.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/gemm_operation.py
+++ b/python/cutlass_library/gemm_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/generator.py
+++ b/python/cutlass_library/generator.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@ -5203,7 +5203,7 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments.
@ -5268,7 +5268,7 @@ def GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9992)
  is_aligned = False

  # layouts for ABC and their alignments.
@ -5329,7 +5329,7 @@ def GenerateSM90_SparseTensorOp_16b_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments.
@ -5394,7 +5394,7 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments
@ -5452,7 +5452,7 @@ def GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9992)
  is_aligned = False

  # layouts for ABC and their alignments.
@ -5509,7 +5509,7 @@ def GenerateSM90_SparseTensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments
@ -5564,7 +5564,7 @@ def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments
@ -5616,7 +5616,7 @@ def GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9992)
  is_aligned = False

  # layouts for ABC and their alignments
@ -5668,7 +5668,7 @@ def GenerateSM90_SparseTensorOp_int8_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments
@ -5723,7 +5723,7 @@ def GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments
@ -5789,7 +5789,7 @@ def GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=0, default_level=101, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=0, default_level=101, exhaustive_level=9992)
  is_aligned = False

  # layouts for ABC and their alignments
@ -5847,7 +5847,7 @@ def GenerateSM90_SparseTensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
    return

-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9992)
  is_aligned = True

  # layouts for ABC and their alignments
--- a/python/cutlass_library/library.py
+++ b/python/cutlass_library/library.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/manifest.py
+++ b/python/cutlass_library/manifest.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@ -563,17 +563,31 @@ class Manifest:
    self.operations_by_name = {}
    self.disable_full_archs_compilation = args.disable_full_archs_compilation
    self.is_kernel_filter_set_to_all = args.instantiation_level == "max" and args.kernels != ''
+    self.instantiation_level = 0
+    try:
+        self.instantiation_level = int(args.instantiation_level)
+    except ValueError:
+        self.instantiation_level = 0

-  def get_sm90_instantiation_level(self, pruned_level=0, default_level=111, exhaustive_level=9999):
+  def get_sm90_instantiation_level(self, pruned_level=0, default_level=111, exhaustive_level=9992):
    # Non-negative integer which determines how many kernels are instantiated.
    # 0 = 0000 generates the fewest kernels, 9999 generates all possible combinations.
    # increasing first digit reduces schedule / mixed type pruning,
    # increasing second digit generates more cluster sizes,
-    # increasing third digit generates more MMA shapes,
+    # increasing third digit generates more MMA multipliers,
    # increasing fourth digit generates more instruction shapes.
-    return exhaustive_level if self.is_kernel_filter_set_to_all else (
-      pruned_level if self.kernel_filter == '' else default_level
-    )
+
+    if self.instantiation_level > 0:
+        return self.instantiation_level
+
+    elif self.is_kernel_filter_set_to_all:
+        return exhaustive_level
+
+    elif self.kernel_filter == '':
+        return pruned_level
+
+    else:
+        return default_level


  def get_kernel_filters (self, kernelListFile):
--- a/python/cutlass_library/rank_2k_operation.py
+++ b/python/cutlass_library/rank_2k_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/rank_k_operation.py
+++ b/python/cutlass_library/rank_k_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/sm90_shapes.py
+++ b/python/cutlass_library/sm90_shapes.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/sm90_utils.py
+++ b/python/cutlass_library/sm90_utils.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@ -261,21 +261,30 @@ def is_tile_desc_valid(tile_description):
        tile_description.math_instruction.element_accumulator
    )

-    cluster_shape, cta_shape, inst_shape = (
+    cluster_size, cta_shape = (
        tile_description.cluster_shape,
        tile_description.threadblock_shape,
-        tile_description.math_instruction.instruction_shape
    )
    grid_size = (
-        cta_shape[0] * cluster_shape[0] +
-        cta_shape[1] * cluster_shape[1] +
-        cta_shape[2] * cluster_shape[2]
+        cta_shape[0] * cluster_size[0] +
+        cta_shape[1] * cluster_size[1] +
+        cta_shape[2] * cluster_size[2]
    )
-    cluster_size = cluster_shape[0] * cluster_shape[1] * cluster_shape[2]
+    num_ctas_in_cluster = cluster_size[0] * cluster_size[1] * cluster_size[2]
+    cluster_shape = (
+        cluster_size[0] * cta_shape[0],
+        cluster_size[1] * cta_shape[1],
+        cluster_size[2] * cta_shape[2]
+    )
+
+    FP32_TYPES = [DataType.f32, DataType.tf32]
+    FP16_TYPES = [DataType.f16, DataType.bf16]
+    is_fp32 = element_a in FP32_TYPES and element_b in FP32_TYPES
+    is_fp16 = element_a in FP16_TYPES and element_b in FP16_TYPES

    # Maximum number of CTAs per cluster is 8 for Hopper, but up to 16 is
    # allowed for non portable clusters.
-    if cluster_size > 16 or cluster_size < 1:
+    if num_ctas_in_cluster > 16 or num_ctas_in_cluster < 1:
        return False

    if grid_size < 1:
@ -299,8 +308,17 @@ def is_tile_desc_valid(tile_description):
    if cta_shape[2] < 16 or cta_shape[2] % 8 != 0:
        return False

-    # Minimum of 2 stages
-    if cta_shape[2] < inst_shape[2] or cta_shape[2] % inst_shape[2] != 0 or cta_shape[2] / inst_shape[2] < 2:
+    # Minimum of 2 stages (very rough heuristic that may filter out valid kernel configs)
+    if (cluster_shape[0] >= 128 or cluster_shape[1] >= 128) and cluster_shape[2] >= 256:
+        return False
+
+    if is_fp32 and (cluster_shape[0] >= 128 or cluster_shape[1] >= 128) and cluster_shape[2] >= 128:
+        return False
+
+    if is_fp32 and cluster_shape[0] >= 256 and cluster_shape[1] >= 256 and cluster_shape[2] >= 64:
+        return False
+
+    if is_fp16 and cluster_shape[0] >= 256 and cluster_shape[1] >= 256 and cluster_shape[2] >= 128:
        return False

    # CTA shape upper bound: <256, 256, 256>
@ -329,6 +347,7 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level:
    tile_descriptions = set()
    mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned)
    for math_inst, mma_mul, cluster_size in product(math_instructions, mma_multipliers, cluster_sizes):
+
        tile_desc = TileDescription(
            threadblock_shape=[
                math_inst.instruction_shape[0] * mma_mul[0],
@ -426,6 +445,25 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
    d_type = data_types["d_type"]
    is_void_c = c_type == DataType.void

+    # Filter out invalid kernels
+    is_nt = layout[0][0] == LayoutType.ColumnMajor and layout[1][0] == LayoutType.RowMajor
+    is_tn = layout[0][0] == LayoutType.RowMajor and layout[1][0] == LayoutType.ColumnMajor
+    is_nn = layout[0][0] == LayoutType.ColumnMajor and layout[1][0] == LayoutType.ColumnMajor
+
+    # static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    #   "Copy size must evenly divide SMEM tile.");
+    if is_fp32 and is_nt and (cta_n % cta_k != 0):
+        return [], []
+
+    # static_assert(!TransposeB || (cutlass::bits_to_bytes((size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value))) == 128,
+    # "SmemLayoutB K must be 128bytes to be transposed.")
+    if is_fp32 and is_nt and cta_k != 32:
+        return [], []
+
+    # Static assert failure when instantiating SmemLayoutB
+    if is_fp32 and (is_tn or is_nn) and (cta_n % cta_k != 0):
+        return [], []
+
    # Early pruning
    if level < 1:
        # Don't stamp out FP16/BF16 kernels smaller than or equal to 64x128x64
@ -441,7 +479,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
            if CudaToolkitVersionSatisfies(cuda_version, 12, 1) and can_do_cooperative and can_do_tma_epilogue:
                return [
                    [
-                        KernelScheduleType.TmaWarpSpecializedCooperative if not is_sparse else KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
+                        KernelScheduleType.TmaWarpSpecializedCooperative,
                        EpilogueScheduleType.TmaWarpSpecializedCooperative
                    ],
                    [
@ -489,8 +527,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
        # Pruning: don't stamp out fp8 kernels with auto schedule
        if not is_fp8:
            schedules.append([KernelScheduleType.ScheduleAuto, auto_epilogue])
-        if not (is_fp8 and is_sparse):
-            schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
+        schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
    stream_k_schedules = []
    
    if CudaToolkitVersionSatisfies(cuda_version, 12, 0):
@ -518,16 +555,14 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
            schedules.append([KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, default_epilogue])

        if can_do_cooperative:
-            # Sparse kernels only support FastAccum FP8 mainloop
-            if not (is_fp8 and is_sparse):
-                schedules.append([
-                    KernelScheduleType.TmaWarpSpecializedCooperative,
-                    default_epilogue
-                ])
-                stream_k_schedules.append([
-                    KernelScheduleType.TmaWarpSpecializedCooperative,
-                    default_epilogue
-                ])
+            schedules.append([
+                KernelScheduleType.TmaWarpSpecializedCooperative,
+                default_epilogue
+            ])
+            stream_k_schedules.append([
+                KernelScheduleType.TmaWarpSpecializedCooperative,
+                default_epilogue
+            ])
            if can_do_fp8_fast_accum:
                schedules.append([
                    KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
@ -542,16 +577,14 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
        if can_do_tma_epilogue:
            assert not requires_transposed_epilogue
            if can_do_cooperative:
-                # Sparse kernels only support FastAccum FP8 mainloop
-                if not (is_fp8 and is_sparse):
-                    schedules.append([
-                        KernelScheduleType.TmaWarpSpecializedCooperative,
-                        EpilogueScheduleType.TmaWarpSpecializedCooperative
-                    ])
-                    stream_k_schedules.append([
-                        KernelScheduleType.TmaWarpSpecializedCooperative,
-                        EpilogueScheduleType.TmaWarpSpecializedCooperative
-                    ])
+                schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    EpilogueScheduleType.TmaWarpSpecializedCooperative
+                ])
+                stream_k_schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    EpilogueScheduleType.TmaWarpSpecializedCooperative
+                ])
                if can_do_fp8_fast_accum:
                    schedules.append([
                        KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
--- a/python/cutlass_library/symm_operation.py
+++ b/python/cutlass_library/symm_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
--- a/python/cutlass_library/trmm_operation.py
+++ b/python/cutlass_library/trmm_operation.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without