CUTLASS 3.3.0 (#1167)
* Release 3.3.0 Adds support for mixed precision GEMMs On Hopper and Ampere Adds support for < 16B aligned GEMMs on Hopper Enhancements to EVT Enhancements to Python interface Enhancements to Sub-byte type handling in CuTe Several other bug-fixes and performance improvements. * minor doc update
This commit is contained in:
@ -30,6 +30,7 @@
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from . import conv2d_operation
|
||||
@ -47,3 +48,16 @@ from . import rank_2k_operation
|
||||
from . import rank_k_operation
|
||||
from . import symm_operation
|
||||
from . import trmm_operation
|
||||
|
||||
# Make enum types from library.py accessible via cutlass_library.*
|
||||
from .library import *
|
||||
|
||||
# Set up `source` to point to the path containing the CUTLASS source.
|
||||
# Check first if the path cotains a `source` subdirectory -- this will
|
||||
# be the case when the package has been installed via pip. Otherwise,
|
||||
# default to the root of CUTLASS.
|
||||
install_source_path = os.path.join(__path__[0], 'source')
|
||||
if os.path.isdir(install_source_path):
|
||||
source_path = install_source_path
|
||||
else:
|
||||
source_path = os.path.join(__path__[0], '../..')
|
||||
|
||||
@ -38,7 +38,13 @@ import enum
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
###################################################################################################
|
||||
|
||||
@ -62,11 +68,6 @@ class Conv2dOperation:
|
||||
self.stride_support = stride_support
|
||||
self.swizzling_functor = swizzling_functor
|
||||
self.group_mode = group_mode
|
||||
|
||||
#
|
||||
def is_mixed_input(self):
|
||||
return self.A.element != self.B.element
|
||||
|
||||
#
|
||||
def is_complex(self):
|
||||
complex_operators = [
|
||||
@ -75,6 +76,10 @@ class Conv2dOperation:
|
||||
]
|
||||
return self.tile_description.math_instruction.math_operation in complex_operators
|
||||
|
||||
#
|
||||
def is_mixed_input(self):
|
||||
return self.A.element != self.B.element
|
||||
|
||||
#
|
||||
def accumulator_type(self):
|
||||
accum = self.tile_description.math_instruction.element_accumulator
|
||||
@ -262,7 +267,7 @@ class EmitConv2dInstance:
|
||||
1,
|
||||
${threadblock_output_shape_n},
|
||||
${threadblock_output_shape_p},
|
||||
${threadblock_output_shape_q}>,
|
||||
${threadblock_output_shape_q}>,
|
||||
${stages},
|
||||
${math_operator},
|
||||
${iterator_algorithm},
|
||||
|
||||
@ -38,7 +38,13 @@ import enum
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
###################################################################################################
|
||||
|
||||
@ -60,11 +66,11 @@ class Conv3dOperation:
|
||||
self.iterator_algorithm = iterator_algorithm
|
||||
self.stride_support = stride_support
|
||||
self.swizzling_functor = swizzling_functor
|
||||
|
||||
|
||||
#
|
||||
def is_mixed_input(self):
|
||||
return self.A.element != self.B.element
|
||||
|
||||
|
||||
#
|
||||
def core_name(self):
|
||||
''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
|
||||
|
||||
@ -34,14 +34,20 @@
|
||||
Utilities for emitting GEMM kernels
|
||||
"""
|
||||
|
||||
import collections
|
||||
import enum
|
||||
import os.path
|
||||
import shutil
|
||||
import functools
|
||||
import operator
|
||||
import collections
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
###################################################################################################
|
||||
#
|
||||
@ -55,9 +61,14 @@ class GemmOperation:
|
||||
def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
|
||||
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, D = None,
|
||||
kernel_schedule = KernelScheduleType.ScheduleAuto, epilogue_schedule = EpilogueScheduleType.ScheduleAuto,
|
||||
tile_scheduler = TileSchedulerType.Default):
|
||||
tile_scheduler = TileSchedulerType.Default, extra_args = None):
|
||||
|
||||
self.prefix = "3x" if gemm_kind == GemmKind.Universal3x else ""
|
||||
kinds_3x = {
|
||||
GemmKind.Universal3x,
|
||||
GemmKind.SparseUniversal3x,
|
||||
}
|
||||
self.is_3x = gemm_kind in kinds_3x
|
||||
self.prefix = "3x" if self.is_3x else ""
|
||||
self.operation_kind = OperationKind.Gemm
|
||||
self.arch = arch
|
||||
self.tile_description = tile_description
|
||||
@ -66,10 +77,11 @@ class GemmOperation:
|
||||
self.B = B
|
||||
self.C = C
|
||||
self.D = D
|
||||
|
||||
if self.D == None:
|
||||
self.D = self.C
|
||||
|
||||
if gemm_kind != GemmKind.Universal3x:
|
||||
if not self.is_3x:
|
||||
assert(kernel_schedule == KernelScheduleType.ScheduleAuto)
|
||||
assert(epilogue_schedule == EpilogueScheduleType.ScheduleAuto)
|
||||
self.kernel_schedule = kernel_schedule
|
||||
@ -91,7 +103,7 @@ class GemmOperation:
|
||||
#
|
||||
def is_mixed_input(self):
|
||||
return self.A.element != self.B.element
|
||||
|
||||
|
||||
#
|
||||
def is_planar_complex(self):
|
||||
return self.gemm_kind in (GemmKind.PlanarComplex, GemmKind.PlanarComplexArray)
|
||||
@ -125,13 +137,20 @@ class GemmOperation:
|
||||
MathOperation.and_popc: 'and'
|
||||
}
|
||||
|
||||
if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
|
||||
self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
|
||||
tensor_ops = [
|
||||
OpcodeClass.TensorOp,
|
||||
OpcodeClass.WmmaTensorOp,
|
||||
OpcodeClass.SparseTensorOp,
|
||||
]
|
||||
|
||||
is_tensor_op = self.tile_description.math_instruction.opcode_class in tensor_ops
|
||||
|
||||
if is_tensor_op:
|
||||
|
||||
math_op = self.tile_description.math_instruction.math_operation
|
||||
math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
|
||||
|
||||
if self.gemm_kind == GemmKind.Universal3x:
|
||||
if self.is_3x:
|
||||
inst_shape = "{0}x{1}x{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape))
|
||||
else:
|
||||
inst_shape = "{0}{1}{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape))
|
||||
@ -183,6 +202,16 @@ class GemmOperation:
|
||||
core_name = self.core_name())
|
||||
return extended_name
|
||||
|
||||
def datatype_name_3x(self):
|
||||
'''Generates a string representing the MMA atom. Assumes accumulator type is C type.'''
|
||||
datatype_name = "{element_a}_{element_b}_{element_acc}_{element_c}_{element_d}".format(
|
||||
element_a = DataTypeNames[self.A.element],
|
||||
element_b = DataTypeNames[self.B.element],
|
||||
element_acc = DataTypeNames[self.tile_description.math_instruction.element_accumulator],
|
||||
element_c = DataTypeNames[self.C.element],
|
||||
element_d = DataTypeNames[self.D.element])
|
||||
return datatype_name
|
||||
|
||||
# Generates a short string representing the AB layout tags (e.g. nt or tn)
|
||||
def layout_name(self):
|
||||
if self.is_complex() or self.is_planar_complex():
|
||||
@ -213,6 +242,10 @@ class GemmOperation:
|
||||
def epilogue_schedule_name_3x(self):
|
||||
return EpilogueScheduleSuffixes[self.epilogue_schedule]
|
||||
|
||||
# Generate a short string representing the operation class
|
||||
def opcode_class_name(self):
|
||||
return OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
|
||||
|
||||
# Generates the full kernel function name
|
||||
def procedural_name(self):
|
||||
''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
|
||||
@ -661,7 +694,6 @@ ${compile_guard_end}
|
||||
|
||||
###################################################################################################
|
||||
|
||||
#
|
||||
class EmitGemmUniversal3xInstance:
|
||||
''' Responsible for emitting a CUTLASS 3.x template definition'''
|
||||
|
||||
@ -687,10 +719,10 @@ class EmitGemmUniversal3xInstance:
|
||||
|
||||
using ${operation_name}_epilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
${arch}, ${opcode_class},
|
||||
${arch}, ${opcode_class_epi},
|
||||
cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
|
||||
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
${epi_tile_mn},
|
||||
${element_accumulator}, ${element_epilogue},
|
||||
${element_c}, ${layout_c}, ${align_c},
|
||||
${element_d}, ${layout_d}, ${align_d},
|
||||
@ -699,7 +731,7 @@ using ${operation_name}_epilogue =
|
||||
|
||||
using ${operation_name}_mainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
${arch}, ${opcode_class},
|
||||
${arch}, ${opcode_class_main},
|
||||
${element_a}, ${layout_a}, ${align_a},
|
||||
${element_b}, ${layout_b}, ${align_b},
|
||||
${element_accumulator},
|
||||
@ -743,6 +775,10 @@ ${compile_guard_end}
|
||||
stage_count_string = f"cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename {str(operation.procedural_name())}_epilogue::SharedStorage)>"
|
||||
warp_shape = [tile_shape[idx] // warp_count[idx] for idx in range(3)]
|
||||
|
||||
epi_tile_mn = "cutlass::epilogue::collective::EpilogueTileAuto"
|
||||
opcode_class_main = operation.tile_description.math_instruction.opcode_class
|
||||
opcode_class_epi = opcode_class_main
|
||||
|
||||
instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \
|
||||
(operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)
|
||||
|
||||
@ -760,20 +796,23 @@ ${compile_guard_end}
|
||||
else:
|
||||
epilogue_functor = self.epilogue_functor.emit_declaration()
|
||||
#
|
||||
|
||||
element_a = DataTypeTag[operation.A.element]
|
||||
element_b = DataTypeTag[operation.B.element]
|
||||
epilogue_schedule_type = EpilogueScheduleTag[operation.epilogue_schedule]
|
||||
values = {
|
||||
'operation_name': operation.procedural_name(),
|
||||
'operation_suffix': self.operation_suffix,
|
||||
'element_a': DataTypeTag[operation.A.element],
|
||||
'element_a': element_a,
|
||||
'layout_a': LayoutTag[instance_layout_A],
|
||||
'element_b': DataTypeTag[operation.B.element],
|
||||
'element_b': element_b,
|
||||
'layout_b': LayoutTag[instance_layout_B],
|
||||
'element_c': DataTypeTag[operation.C.element],
|
||||
'layout_c': LayoutTag[instance_layout_C],
|
||||
'element_d': DataTypeTag[operation.D.element],
|
||||
'layout_d': LayoutTag[instance_layout_D],
|
||||
'element_accumulator': DataTypeTag[operation.accumulator_type()],
|
||||
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
|
||||
'opcode_class_main': OpcodeClassTag[opcode_class_main],
|
||||
'opcode_class_epi': OpcodeClassTag[opcode_class_epi],
|
||||
'arch': "cutlass::arch::Sm%d" % operation.arch,
|
||||
'tile_shape_m': str(operation.tile_description.tile_shape[0]),
|
||||
'tile_shape_n': str(operation.tile_description.tile_shape[1]),
|
||||
@ -788,7 +827,8 @@ ${compile_guard_end}
|
||||
'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
|
||||
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
|
||||
'kernel_schedule' : str(KernelScheduleTag[operation.kernel_schedule]),
|
||||
'epilogue_schedule' : str(EpilogueScheduleTag[operation.epilogue_schedule]),
|
||||
'epilogue_schedule' : str(epilogue_schedule_type),
|
||||
'epi_tile_mn' : epi_tile_mn,
|
||||
'epilogue_functor': epilogue_functor,
|
||||
'stages': stage_count_string,
|
||||
'align_a': str(operation.A.alignment),
|
||||
@ -800,7 +840,7 @@ ${compile_guard_end}
|
||||
'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
|
||||
'epilogue_vector_length': str(epilogue_vector_length),
|
||||
'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
|
||||
'tile_scheduler': str(TileSchedulerTag[operation.tile_scheduler])
|
||||
'tile_scheduler': str(TileSchedulerTag[operation.tile_scheduler]),
|
||||
}
|
||||
|
||||
return SubstituteTemplate(self.gemm_template, values)
|
||||
|
||||
@ -34,16 +34,52 @@
|
||||
Utilities for enumerating CUTLASS library kernels
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import enum
|
||||
from itertools import product
|
||||
import logging
|
||||
import os.path
|
||||
import shutil
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from cutlass_library.library import *
|
||||
from cutlass_library.manifest import *
|
||||
from itertools import product
|
||||
import sys
|
||||
|
||||
|
||||
# Certain usecases of cutlass_library nearly always prefer to run as scripts with
|
||||
# relative imports, rather than via an installed Python package. An example of this
|
||||
# is using CUTLASS's CMake system to generate a library of kernels to be profiled.
|
||||
# To make it easy to use these use cases when an existing installation of cutlass_library
|
||||
# exists, this global flag can be set to true (via command-line arguments) to ensure
|
||||
# that package-based installations are not used.
|
||||
|
||||
# Create a temporary argument parser to check only for the availability of the
|
||||
# --disable-cutlass-package-imports argument, which controls whether package-based
|
||||
# imports are disabled.
|
||||
def _add_package_disablement_flag(argparser):
|
||||
argparser.add_argument("--disable-cutlass-package-imports", action='store_true', required=False,
|
||||
help="Disable use of cutlass_library from Python package")
|
||||
|
||||
_parser = argparse.ArgumentParser()
|
||||
_add_package_disablement_flag(_parser)
|
||||
_args, _ = _parser.parse_known_args()
|
||||
|
||||
# Add `CUTLASS_IGNORE_PACKAGE` to `builtins` so that it is visible for gating future
|
||||
# imports without requiring importing another module. Ideally, we would just place this
|
||||
# as a global variable in a module to that could be imported and checked (e.g.,
|
||||
# utils.CUTLASS_IGNORE_PACKAGE). However, this raises the issue of determining
|
||||
# where this module should be sourced (from the cutlass_library package or from
|
||||
# a relative import), which is the problem this variable is being used to solve in the
|
||||
# first place.
|
||||
import builtins
|
||||
builtins.CUTLASS_IGNORE_PACKAGE = _args.disable_cutlass_package_imports
|
||||
|
||||
try:
|
||||
if CUTLASS_IGNORE_PACKAGE:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
from cutlass_library.manifest import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
from manifest import *
|
||||
###################################################################################################
|
||||
|
||||
#
|
||||
@ -79,7 +115,7 @@ def EpilogueAlignment(max_alignment, tile, epilogue_steps = 8):
|
||||
return min(max_alignment, elements_per_thread)
|
||||
|
||||
def DefaultSwizzlingFunctor():
|
||||
return SwizzlingFunctor.Identity8;
|
||||
return SwizzlingFunctor.Identity8
|
||||
# To use StreamK decomposition for basic GEMMs, set `swizzling_functor = SwizzlingFunctor.StreamK`
|
||||
|
||||
#
|
||||
@ -103,7 +139,7 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
|
||||
for tile_description in tile_descriptions:
|
||||
for alignment in alignment_constraints:
|
||||
for complex_transform in complex_transforms:
|
||||
|
||||
|
||||
# If alignment is a tuple or a list, then we have different alignments for A and B
|
||||
alignment_a = alignment if isinstance(alignment, int) else alignment[0]
|
||||
alignment_b = alignment if isinstance(alignment, int) else alignment[1]
|
||||
@ -121,7 +157,6 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
|
||||
|
||||
return operations
|
||||
|
||||
|
||||
# Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts
|
||||
def CreateGemmUniversal3xOperator(
|
||||
manifest, layouts, tile_descriptions, data_types,
|
||||
@ -157,11 +192,14 @@ def CreateGemmUniversal3xOperator(
|
||||
C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1])
|
||||
D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1])
|
||||
|
||||
extra_args = {}
|
||||
gemm_kind = GemmKind.Universal3x
|
||||
element_compute = data_type.get("epi_type", data_type["acc_type"])
|
||||
|
||||
operation = GemmOperation(
|
||||
GemmKind.Universal3x, tile_description.minimum_compute_capability,
|
||||
gemm_kind, tile_description.minimum_compute_capability,
|
||||
tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D,
|
||||
kernel_schedule, epilogue_schedule, tile_scheduler)
|
||||
kernel_schedule, epilogue_schedule, tile_scheduler, extra_args)
|
||||
|
||||
manifest.append(operation)
|
||||
operations.append(operation)
|
||||
@ -2153,7 +2191,6 @@ def GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version):
|
||||
CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
|
||||
data_type_mixed, alignment_constraints, complex_transforms)
|
||||
|
||||
|
||||
#
|
||||
def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
|
||||
|
||||
@ -2225,8 +2262,9 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
|
||||
math_inst.element_accumulator,
|
||||
]
|
||||
|
||||
# streamk uses more regs which can cause spill for the biggest warp tile size when the accumulators are 32bit.
|
||||
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
|
||||
data_type, alignment_constraints)
|
||||
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
|
||||
|
||||
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
|
||||
if math_inst.element_a != math_inst.element_accumulator:
|
||||
@ -2239,14 +2277,13 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):
|
||||
]
|
||||
|
||||
operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
|
||||
data_type_mixed, alignment_constraints)
|
||||
|
||||
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
|
||||
|
||||
for op in operations:
|
||||
if (DataTypeSize[op.C.element] == 16) and \
|
||||
(op.tile_description.threadblock_shape[1] <= 32):
|
||||
op.C.alignment = 4
|
||||
|
||||
|
||||
#
|
||||
def GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version):
|
||||
|
||||
@ -2287,8 +2324,7 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version):
|
||||
# inner list contains the alignment constraints for operands/matrices
|
||||
# [[alignA, alignB, alignC],..]
|
||||
alignment_constraints = [[8, 16, 8],]
|
||||
|
||||
|
||||
|
||||
for math_inst in math_instructions:
|
||||
tile_descriptions = [
|
||||
# 128x128
|
||||
@ -2321,8 +2357,9 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version):
|
||||
math_inst.element_accumulator,
|
||||
]
|
||||
|
||||
# streamk uses more regs which can cause spill for the biggest warp tile size when the accumulators are 32bit.
|
||||
CreateGemmOperator(manifest, layouts, tile_descriptions, \
|
||||
data_type, alignment_constraints)
|
||||
data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
|
||||
|
||||
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
|
||||
if math_inst.element_a != math_inst.element_accumulator:
|
||||
@ -2335,12 +2372,12 @@ def GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version):
|
||||
]
|
||||
|
||||
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
|
||||
data_type_mixed, alignment_constraints)
|
||||
|
||||
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)
|
||||
|
||||
for op in operations:
|
||||
if op.tile_description.threadblock_shape[1] <= 32:
|
||||
op.C.alignment = 4
|
||||
|
||||
|
||||
#
|
||||
def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
|
||||
|
||||
@ -2723,6 +2760,7 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version):
|
||||
|
||||
for op in operations:
|
||||
op.C.alignment = 16
|
||||
#
|
||||
|
||||
#
|
||||
def GenerateSM80_TensorOp_168256(manifest, cuda_version):
|
||||
@ -4458,6 +4496,154 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
|
||||
[[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
|
||||
tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version):
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
|
||||
return
|
||||
|
||||
# layouts for ABC and their alignments.
|
||||
layouts = [
|
||||
[[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.RowMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.ColumnMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.RowMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
|
||||
[[LayoutType.ColumnMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
|
||||
]
|
||||
|
||||
math_instructions = [
|
||||
MathInstruction(
|
||||
[64, 128, 16],
|
||||
DataType.f16, DataType.f16, DataType.f16,
|
||||
OpcodeClass.TensorOp,
|
||||
MathOperation.multiply_add),
|
||||
MathInstruction(
|
||||
[64, 128, 16],
|
||||
DataType.f16, DataType.f16, DataType.f32,
|
||||
OpcodeClass.TensorOp,
|
||||
MathOperation.multiply_add),
|
||||
MathInstruction(
|
||||
[64, 128, 16],
|
||||
DataType.bf16, DataType.bf16, DataType.f32,
|
||||
OpcodeClass.TensorOp,
|
||||
MathOperation.multiply_add),
|
||||
]
|
||||
|
||||
min_cc = 90
|
||||
max_cc = 90
|
||||
|
||||
for math_inst in math_instructions:
|
||||
tile_descriptions_small = [
|
||||
# TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
|
||||
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
|
||||
]
|
||||
tile_descriptions_medium = [
|
||||
TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
|
||||
0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
|
||||
# TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
|
||||
# 0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
|
||||
]
|
||||
tile_descriptions = tile_descriptions_small + tile_descriptions_medium
|
||||
|
||||
data_type = {
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
"c_type" : math_inst.element_accumulator,
|
||||
"d_type" : math_inst.element_accumulator,
|
||||
"acc_type" : math_inst.element_accumulator,
|
||||
"epi_type" : math_inst.element_accumulator
|
||||
}
|
||||
|
||||
# Set alignment c based on Destination format.
|
||||
for layout in layouts:
|
||||
if data_type["c_type"] in [DataType.s32, DataType.f32]:
|
||||
layout[2][1] = 4
|
||||
elif data_type["c_type"] in [DataType.f16, DataType.bf16]:
|
||||
layout[2][1] = 8
|
||||
|
||||
schedules = [
|
||||
# [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
|
||||
[KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
|
||||
]
|
||||
stream_k_schedules = []
|
||||
|
||||
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
|
||||
schedules += [
|
||||
[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
|
||||
# [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]
|
||||
]
|
||||
stream_k_schedules += [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
|
||||
|
||||
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
|
||||
|
||||
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
|
||||
# Add stream-K variants
|
||||
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
# persistent kernels with TMA epilogues
|
||||
# if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
|
||||
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
|
||||
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
|
||||
# tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
# # Emit instance without C allocation + load
|
||||
# data_type["c_type"] = DataType.void
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
|
||||
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
|
||||
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
|
||||
# tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
# for mixed precision kernels, also generate kernels that write output matrix in the A/B format
|
||||
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
|
||||
if math_inst.element_a != math_inst.element_accumulator:
|
||||
data_type_mixed = {
|
||||
"a_type" : math_inst.element_a,
|
||||
"b_type" : math_inst.element_b,
|
||||
"c_type" : math_inst.element_a,
|
||||
"d_type" : math_inst.element_a,
|
||||
"acc_type" : math_inst.element_accumulator,
|
||||
"epi_type" : math_inst.element_accumulator
|
||||
}
|
||||
|
||||
# Set alignment c based on Destination format.
|
||||
for layout in layouts:
|
||||
if data_type_mixed["c_type"] in [DataType.s32, DataType.f32]:
|
||||
layout[2][1] = 4
|
||||
elif data_type_mixed["c_type"] in [DataType.f16, DataType.bf16]:
|
||||
layout[2][1] = 8
|
||||
|
||||
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, schedules)
|
||||
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
# persistent kernels with TMA epilogues
|
||||
# if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
|
||||
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
|
||||
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
|
||||
# tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
# # Emit instance without C allocation+load
|
||||
# data_type_mixed["c_type"] = DataType.void
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
|
||||
# [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
|
||||
|
||||
# CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
|
||||
# [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
|
||||
# tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
|
||||
@ -4582,6 +4768,91 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
|
||||
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions, data_types, schedules_default)
|
||||
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_types, schedules_transposed_epilogue)
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version):
  """Emit SM90 TF32 WGMMA GEMM kernels for A/B operands aligned below the
  16B TMA requirement (1- or 2-element alignment), using cp.async mainloops.

  Requires CUDA 12.0+; the cooperative and stream-K schedules additionally
  require CUDA 12.1+.

  :param manifest: kernel manifest to append the generated operations to
  :param cuda_version: CUDA toolkit version string used to gate schedules
  """
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments (alignment counted in elements).
  layouts = [
    [[LayoutType.RowMajor,    2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor,    2], [LayoutType.RowMajor,    2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 2], [LayoutType.RowMajor,    2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor,    1], [LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor,    1], [LayoutType.RowMajor,    1], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 1], [LayoutType.RowMajor,    1], [LayoutType.ColumnMajor, 1]],
  ]

  math_inst = MathInstruction(
    [64, 128, 8],
    DataType.tf32, DataType.tf32, DataType.f32,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  tile_descriptions_medium = [
    TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1])
  ]

  tile_descriptions_small = [
    # TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
    #   0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1])
  ]

  tile_descriptions = tile_descriptions_medium + tile_descriptions_small

  data_types = [
    {
      "a_type"   : math_inst.element_a,
      "b_type"   : math_inst.element_b,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : math_inst.element_accumulator
    },
    {
      "a_type"   : DataType.f32,
      "b_type"   : DataType.f32,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : DataType.f32
    }
  ]

  is_tt_layout = lambda v: v[0][0] == LayoutType.RowMajor and v[1][0] == LayoutType.RowMajor
  # Split kernels into TN/NT/NN vs. TT layouts.
  # NOTE(fix): materialize the filters into lists. `filter()` returns a
  # single-pass iterator, and `layouts_tn_nn_nt` is consumed by up to three
  # separate CreateGemmUniversal3xOperator calls below -- with a bare filter
  # object the second and third calls would iterate an already-exhausted
  # iterator and silently emit no kernels.
  layouts_tn_nn_nt = list(filter(lambda v: not is_tt_layout(v), layouts))
  layouts_tt = list(filter(is_tt_layout, layouts))

  CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
    # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
    [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
  ])

  # Kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
  # because they use NN kernels underneath and transposing its epilogue will get the correct output
  CreateGemmUniversal3xOperator(manifest, layouts_tt, tile_descriptions, data_types, [
    # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
    [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
  ])

  if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
    CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
      # [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
    ])

    # Stream-K schedules
    CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
    ], tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
|
||||
@ -4677,6 +4948,81 @@ def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
|
||||
tile_schedulers=[TileSchedulerType.Persistent, TileSchedulerType.StreamK]
|
||||
)
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version):
  """Emit SM90 8-bit-integer WGMMA GEMM kernels for operands aligned below
  16B (alignment 8 and 4 elements), using cp.async mainloops.

  Covers s8*s8 and u8*u8 with s32 accumulation. Requires CUDA 12.0+; the
  cooperative and stream-K schedules additionally require CUDA 12.1+.

  :param manifest: kernel manifest to append the generated operations to
  :param cuda_version: CUDA toolkit version string used to gate schedules
  """
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # A/B/C layouts with element alignments; the C alignment placeholder (1)
  # is patched per d_type inside the loop below.
  layouts = [
    [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
  ]

  # Signed and unsigned 8-bit variants, both accumulating in s32.
  math_instructions = [
    MathInstruction([64, 128, 32],
                    DataType.s8, DataType.s8, DataType.s32,
                    OpcodeClass.TensorOp,
                    MathOperation.multiply_add),
    MathInstruction([64, 128, 32],
                    DataType.u8, DataType.u8, DataType.s32,
                    OpcodeClass.TensorOp,
                    MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for inst in math_instructions:
    m, n, k = inst.instruction_shape
    # Single 128x128x128 CTA tile; a 64-wide "small" tile exists in the
    # TF32 variant but is likewise left disabled here.
    tile_descriptions = [
      TileDescription([m * 2, n, k * 4],
                      0, [4, 1, 1], inst, min_cc, max_cc, [1, 1, 1]),
    ]

    accum = inst.element_accumulator
    data_types = [
      # Accumulator-typed (s32) C and D.
      {
        "a_type"   : inst.element_a,
        "b_type"   : inst.element_b,
        "c_type"   : accum,
        "d_type"   : accum,
        "acc_type" : accum,
        "epi_type" : accum,
      },
      # s8 source C with an 8-bit D and f32 epilogue compute.
      {
        "a_type"   : inst.element_a,
        "b_type"   : inst.element_b,
        "c_type"   : DataType.s8,
        "d_type"   : inst.element_a,
        "acc_type" : accum,
        "epi_type" : DataType.f32,
      },
    ]

    for dtype_combo in data_types:
      # Patch the C/D alignment so every kernel performs 128-bit stores.
      for layout in layouts:
        layout[2][1] = 128 // DataTypeSize[dtype_combo["d_type"]]

      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, dtype_combo, [
        [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
      ])

      if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, dtype_combo, [
          [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
        ])
        # Stream-K variant of the cooperative schedule.
        CreateGemmUniversal3xOperator(
          manifest, layouts, tile_descriptions, dtype_combo,
          [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]],
          tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
|
||||
if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
|
||||
return
|
||||
@ -4882,6 +5228,188 @@ def GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
|
||||
[KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
|
||||
tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version):
  """Emit SM90 FP8 (e4m3/e5m2) WGMMA GEMM kernels for A/B alignments below
  the 16B TMA requirement (alignment 8 and 4 elements), using cp.async
  mainloops.

  Requires CUDA 12.0+; cooperative and stream-K schedules additionally
  require CUDA 12.1+.

  :param manifest: kernel manifest to append the generated operations to
  :param cuda_version: CUDA toolkit version string used to gate schedules
  """
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments (alignment counted in elements)
  layouts = [
    [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]], # TN Layout
    [[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]], # TN Layout
  ]

  # All four e4m3/e5m2 A x B combinations, accumulating in f32.
  math_instructions = [
    # inst 64x128x32
    MathInstruction(
      [64, 128, 32],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    # inst 64x64x32 (currently disabled)
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e4m3, DataType.e4m3, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e4m3, DataType.e5m2, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e5m2, DataType.e4m3, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e5m2, DataType.e5m2, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    # C/D type combinations: f32, bf16, and f16 source C, each paired with a
    # same-typed D and with narrow 8-bit (e4m3 / e5m2) D variants.
    data_types = [
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f32,
        "d_type" : DataType.f32,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f32,
        "d_type" : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f32,
        "d_type" : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.bf16,
        "d_type" : DataType.bf16,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.bf16,
        "d_type" : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.bf16,
        "d_type" : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f16,
        "d_type" : DataType.f16,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f16,
        "d_type" : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f16,
        "d_type" : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
    ]

    if math_inst.instruction_shape[1] == 128:
      tile_descriptions = [
        # 128x128x128
        TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
      ]

    # elif math_inst.instruction_shape[1] == 64:
    #   tile_descriptions = [
    #     # 256x64x128
    #     TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
    #       0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
    #   ]

    else:
      # Only N=128 instructions are enabled above; anything else is a
      # programming error in the math_instructions list.
      assert False, "math inst is not supported"

    # CUDA 12.1+ adds the cooperative cp.async schedule, which is also the
    # only schedule supported by the stream-K tile scheduler here.
    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      schedules = [
        # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
        # [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
      ]
      stream_k_schedules = [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
    else:
      schedules = [
        # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
      ]
      stream_k_schedules = []


    for data_type in data_types:
      # With No-SMEM epilogues
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)

      if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        # Persistent kernels with TMA epilogues
        # CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
        #   [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        # Add stream-K variants (with and without TMA epilogues)
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
        # CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
        #   [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
        #   tile_schedulers=[TileSchedulerType.StreamK])
|
||||
|
||||
#
|
||||
def GenerateSM90_TensorOp_1684(manifest, cuda_version):
|
||||
@ -5488,9 +6016,13 @@ def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version):
|
||||
#
|
||||
def GenerateSM90(manifest, cuda_version):
|
||||
GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_1684(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_1684_complex(manifest, cuda_version)
|
||||
GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version)
|
||||
@ -5543,6 +6075,7 @@ def define_parser():
|
||||
parser.add_argument("--disable-full-archs-compilation", action="store_true", required=False, help="Disable compilation for every archs in --architectures")
|
||||
parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
|
||||
help='Logging level to be used by the generator script')
|
||||
_add_package_disablement_flag(parser)
|
||||
return parser
|
||||
|
||||
|
||||
|
||||
@ -400,6 +400,9 @@ ShortComplexLayoutNames = {
|
||||
class KernelScheduleType(enum.Enum):
|
||||
ScheduleAuto = enum_auto()
|
||||
Multistage = enum_auto()
|
||||
CpAsyncWarpSpecialized = enum_auto()
|
||||
CpAsyncWarpSpecializedPingpong = enum_auto()
|
||||
CpAsyncWarpSpecializedCooperative = enum_auto()
|
||||
Tma = enum_auto()
|
||||
TmaWarpSpecialized = enum_auto()
|
||||
TmaWarpSpecializedPingpong = enum_auto()
|
||||
@ -411,6 +414,9 @@ class KernelScheduleType(enum.Enum):
|
||||
KernelScheduleTag = {
|
||||
KernelScheduleType.ScheduleAuto: 'cutlass::gemm::collective::KernelScheduleAuto',
|
||||
KernelScheduleType.Multistage: 'cutlass::gemm::KernelMultistage',
|
||||
KernelScheduleType.CpAsyncWarpSpecialized: 'cutlass::gemm::KernelCpAsyncWarpSpecialized',
|
||||
KernelScheduleType.CpAsyncWarpSpecializedPingpong: 'cutlass::gemm::KernelCpAsyncWarpSpecializedPingpong',
|
||||
KernelScheduleType.CpAsyncWarpSpecializedCooperative: 'cutlass::gemm::KernelCpAsyncWarpSpecializedCooperative',
|
||||
KernelScheduleType.Tma: 'cutlass::gemm::KernelTma',
|
||||
KernelScheduleType.TmaWarpSpecialized: 'cutlass::gemm::KernelTmaWarpSpecialized',
|
||||
KernelScheduleType.TmaWarpSpecializedPingpong: 'cutlass::gemm::KernelTmaWarpSpecializedPingpong',
|
||||
@ -424,6 +430,9 @@ KernelScheduleTag = {
|
||||
KernelScheduleSuffixes = {
|
||||
KernelScheduleType.ScheduleAuto: '',
|
||||
KernelScheduleType.Multistage: '_cpasync',
|
||||
KernelScheduleType.CpAsyncWarpSpecialized: '_cpasync_warpspecialized',
|
||||
KernelScheduleType.CpAsyncWarpSpecializedPingpong: '_cpasync_warpspecialized_pingpong',
|
||||
KernelScheduleType.CpAsyncWarpSpecializedCooperative: '_cpasync_warpspecialized_cooperative',
|
||||
KernelScheduleType.Tma: '_unspecialized',
|
||||
KernelScheduleType.TmaWarpSpecialized: '_warpspecialized',
|
||||
KernelScheduleType.TmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
|
||||
@ -541,7 +550,6 @@ class OpcodeClass(enum.Enum):
|
||||
WmmaTensorOp = enum_auto()
|
||||
SparseTensorOp = enum_auto()
|
||||
|
||||
|
||||
OpcodeClassNames = {
|
||||
OpcodeClass.Simt: 'simt',
|
||||
OpcodeClass.TensorOp: 'tensorop',
|
||||
@ -628,19 +636,20 @@ class GemmKind(enum.Enum):
|
||||
Sparse = enum_auto()
|
||||
Universal = enum_auto()
|
||||
Universal3x = enum_auto()
|
||||
SparseUniversal3x = enum_auto()
|
||||
PlanarComplex = enum_auto()
|
||||
PlanarComplexArray = enum_auto()
|
||||
Grouped = enum_auto()
|
||||
|
||||
#
|
||||
# Procedural-name fragment used for each GemmKind when emitting kernel names.
# NOTE(fix): the pasted text contained the GemmKind.Grouped entry twice
# (pre- and post-diff lines), which is invalid Python; keep a single entry.
GemmKindNames = {
  GemmKind.Gemm: "gemm",
  GemmKind.Sparse: "spgemm",
  GemmKind.Universal: "gemm",
  GemmKind.Universal3x: "gemm",
  GemmKind.SparseUniversal3x: "spgemm",
  GemmKind.PlanarComplex: "gemm_planar_complex",
  GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
  GemmKind.Grouped: "gemm_grouped",
}
|
||||
|
||||
#
|
||||
@ -797,7 +806,7 @@ class GroupMode(enum.Enum):
|
||||
NoneGroup = enum_auto() # dense conv (G=1)
|
||||
SingleGroup = enum_auto() # grouped convolution (single group per CTA)
|
||||
MultipleGroup = enum_auto() # grouped convolution ( multiple groups per CTA)
|
||||
Depthwise = enum_auto() # Depthwise convolution ( C=K=G )
|
||||
Depthwise = enum_auto() # Depthwise convolution ( C=K=G )
|
||||
|
||||
#
|
||||
GroupModeTag = {
|
||||
@ -818,14 +827,18 @@ GroupModeNames = {
|
||||
|
||||
#
|
||||
class MathInstruction:
  """Description of a single math (MMA) instruction used to build tiles.

  NOTE(fix): the pasted text carried two conflicting `__init__` headers
  (pre- and post-diff lines); the multi-line post-diff signature is kept.
  The backslash continuations are dropped -- they are redundant inside
  parentheses.
  """

  def __init__(self,
               instruction_shape,
               element_a, element_b, element_accumulator,
               opcode_class, math_operation = MathOperation.multiply_add):
    """
    :param instruction_shape: [M, N, K] extent of one instruction
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param element_accumulator: data type of the accumulator
    :param opcode_class: OpcodeClass selecting SIMT / TensorOp / etc.
    :param math_operation: MathOperation applied (default multiply-add)
    """
    self.instruction_shape = instruction_shape
    self.element_a = element_a
    self.element_b = element_b
    self.element_accumulator = element_accumulator
    self.opcode_class = opcode_class
    self.math_operation = math_operation
|
||||
|
||||
#
|
||||
class TileDescription:
|
||||
|
||||
|
||||
@ -36,18 +36,31 @@ and building code
|
||||
"""
|
||||
|
||||
import enum
|
||||
import logging
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
from cutlass_library.gemm_operation import *
|
||||
from cutlass_library.rank_k_operation import *
|
||||
from cutlass_library.rank_2k_operation import *
|
||||
from cutlass_library.trmm_operation import *
|
||||
from cutlass_library.symm_operation import *
|
||||
from cutlass_library.conv2d_operation import *
|
||||
from cutlass_library.conv3d_operation import *
|
||||
import logging
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
from cutlass_library.gemm_operation import *
|
||||
from cutlass_library.rank_k_operation import *
|
||||
from cutlass_library.rank_2k_operation import *
|
||||
from cutlass_library.trmm_operation import *
|
||||
from cutlass_library.symm_operation import *
|
||||
from cutlass_library.conv2d_operation import *
|
||||
from cutlass_library.conv3d_operation import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
from gemm_operation import *
|
||||
from rank_k_operation import *
|
||||
from rank_2k_operation import *
|
||||
from trmm_operation import *
|
||||
from symm_operation import *
|
||||
from conv2d_operation import *
|
||||
from conv3d_operation import *
|
||||
|
||||
###################################################################################################
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
@ -380,7 +393,6 @@ class Manifest:
|
||||
|
||||
architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
|
||||
architectures = [x if x != '90a' else '90' for x in architectures]
|
||||
|
||||
self.compute_capabilities = [int(x) for x in architectures]
|
||||
|
||||
if args.filter_by_cc in ['false', 'False', '0']:
|
||||
|
||||
@ -35,12 +35,18 @@ Utilities for emitting Rank2K kernels
|
||||
"""
|
||||
|
||||
import enum
|
||||
import os.path
|
||||
import shutil
|
||||
import functools
|
||||
import operator
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
|
||||
###################################################################################################
|
||||
@ -82,7 +88,7 @@ class Rank2KOperation:
|
||||
#
|
||||
  def is_mixed_input(self):
    """Return True when operands A and B use different element types."""
    return self.A.element != self.B.element
|
||||
|
||||
|
||||
#
|
||||
  def is_planar_complex(self):
    """Always False: this operation kind has no planar-complex variant."""
    return False
|
||||
@ -234,7 +240,7 @@ using Operation_${operation_name} =
|
||||
"""
|
||||
self.rank_k_complex_template = """
|
||||
// Rank K operator ${operation_name}
|
||||
using Operation_${operation_name} =
|
||||
using Operation_${operation_name} =
|
||||
typename cutlass::gemm::device::Rank2K<
|
||||
${element_a}, ${layout_a},
|
||||
${element_b}, ${layout_b},
|
||||
|
||||
@ -35,12 +35,18 @@ Utilities for emitting RankK kernels
|
||||
"""
|
||||
|
||||
import enum
|
||||
import os.path
|
||||
import shutil
|
||||
import functools
|
||||
import operator
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
|
||||
###################################################################################################
|
||||
@ -80,7 +86,7 @@ class RankKOperation:
|
||||
#
|
||||
  def is_mixed_input(self):
    """Always False: this operation kind is never emitted as mixed-input."""
    return False
|
||||
|
||||
|
||||
#
|
||||
  def is_planar_complex(self):
    """Always False: this operation kind has no planar-complex variant."""
    return False
|
||||
@ -259,7 +265,7 @@ using Operation_${operation_name} =
|
||||
def emit(self, operation):
|
||||
|
||||
threadblock_shape = operation.tile_description.threadblock_shape
|
||||
|
||||
|
||||
warp_count = operation.tile_description.warp_count
|
||||
warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
|
||||
|
||||
|
||||
@ -35,12 +35,18 @@ Utilities for emitting Symm kernels
|
||||
"""
|
||||
|
||||
import enum
|
||||
import os.path
|
||||
import shutil
|
||||
import functools
|
||||
import operator
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
|
||||
###################################################################################################
|
||||
@ -82,7 +88,7 @@ class SymmOperation:
|
||||
#
|
||||
  def is_mixed_input(self):
    """Return True when operands A and B use different element types."""
    return self.A.element != self.B.element
|
||||
|
||||
|
||||
#
|
||||
  def is_planar_complex(self):
    """Always False: this operation kind has no planar-complex variant."""
    return False
|
||||
@ -241,7 +247,7 @@ using Operation_${operation_name} =
|
||||
// Symm operator ${operation_name}
|
||||
using Operation_${operation_name} =
|
||||
typename cutlass::gemm::device::Symm<
|
||||
${element_a}, ${layout_a}, ${side_mode}, ${fill_mode},
|
||||
${element_a}, ${layout_a}, ${side_mode}, ${fill_mode},
|
||||
${element_b}, ${layout_b},
|
||||
${element_c}, ${layout_c},
|
||||
${element_accumulator},
|
||||
|
||||
@ -35,12 +35,18 @@ Utilities for emitting Trmm kernels
|
||||
"""
|
||||
|
||||
import enum
|
||||
import os.path
|
||||
import shutil
|
||||
import functools
|
||||
import operator
|
||||
import os.path
|
||||
import shutil
|
||||
|
||||
from cutlass_library.library import *
|
||||
try:
|
||||
import builtins
|
||||
if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
|
||||
raise ImportError("Disabling attempt to import cutlass_library")
|
||||
from cutlass_library.library import *
|
||||
except ImportError:
|
||||
from library import *
|
||||
|
||||
|
||||
###################################################################################################
|
||||
@ -84,7 +90,7 @@ class TrmmOperation:
|
||||
#
|
||||
  def is_mixed_input(self):
    """Return True when operands A and B use different element types."""
    return self.A.element != self.B.element
|
||||
|
||||
|
||||
#
|
||||
def accumulator_type(self):
|
||||
accum = self.tile_description.math_instruction.element_accumulator
|
||||
|
||||
Reference in New Issue
Block a user