CUTLASS 3.1 (#915)

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
ANIKET SHIVAM committed 2023-04-14 20:19:34 -07:00 (committed by GitHub)
parent 9b8166e3f0
commit d572cc1aab
482 changed files with 37184 additions and 16419 deletions

View File

@ -25,7 +25,7 @@
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_policy(SET CMP0112 NEW)
include(GNUInstallDirs)
find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
@ -94,6 +94,9 @@ file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOU
# set cutlass generator compiler version to filter kernels in the generator not supported by a specific toolkit.
set(CUTLASS_GENERATOR_CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION})
# --log-level is set to DEBUG to enable printing information about which kernels were excluded
# from generation in /tools/library/scripts/manifest.py. To avoid having this information appear
# in ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log, set this parameter to INFO
execute_process(
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/scripts/generator.py
@ -112,6 +115,8 @@ execute_process(
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log
)
message(STATUS "Completed generation of library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log for more information.")
if(NOT cutlass_lib_INSTANCE_GENERATION_RESULT EQUAL 0)
message(FATAL_ERROR "Error generating library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log")
endif()

View File

@ -102,6 +102,12 @@ template <typename OperatorClass> struct ArchMap<arch::Sm90, OperatorClass> {
static int const kMax = 1024;
};
// Arch conditional WGMMA
template <> struct ArchMap<arch::Sm90, arch::OpClassTensorOp> {
static int const kMin = 90;
static int const kMax = 90;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library

View File

@ -178,7 +178,7 @@ public:
int K, /// GEMM K dimension
NumericTypeID element_compute, /// Data type of internal accumulation
NumericTypeID element_scalar, /// Data type of alpha/beta scalars
void const *alpha, /// Pointer to alpha scalar
@ -186,29 +186,29 @@ public:
NumericTypeID element_A, /// Data type of A matrix elements
LayoutTypeID layout_A, /// Layout of A matrix
ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices
void const * ptr_A, /// Pointer to A matrix in Global Memory
int64_t lda, /// Leading dimension of A matrix
int64_t lda, /// Leading dimension of A matrix
NumericTypeID element_B, /// Data type of B matrix elements
LayoutTypeID layout_B, /// Layout of B matrix
ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices
void const * ptr_B, /// Pointer to B matrix in Global Memory
int64_t ldb, /// Leading dimension of B matrix
int64_t ldb, /// Leading dimension of B matrix
void const * beta, /// Pointer to beta scalar
NumericTypeID element_C, /// Data type of C and D matrices
NumericTypeID element_C, /// Data type of C matrix
LayoutTypeID layout_C, /// Layout of C matrix
void const * ptr_C, /// Pointer to C matrix
int64_t ldc, /// Leading dimension of C matrix
int64_t ldc, /// Leading dimension of C matrix
NumericTypeID element_D, /// Data type of D matrix
LayoutTypeID layout_D, /// Layout of D matrix
void * ptr_D, /// Pointer to D matrix
int64_t ldd, /// Leading dimension of D matrix
int64_t ldd, /// Leading dimension of D matrix
int batch_count = 1, /// Batch count or number of split-K slices
int64_t batch_stride_A = 0, /// Batch stride of A operand
int64_t batch_stride_B = 0, /// Batch stride of B operand
int64_t batch_stride_C = 0, /// Batch stride of C operand

View File

@ -114,6 +114,8 @@ enum class NumericTypeID {
kS16,
kS32,
kS64,
kFE4M3,
kFE5M2,
kF16,
kBF16,
kTF32,
@ -474,9 +476,12 @@ struct GemmDescription : public OperationDescription {
/// Describes the B operand
TensorDescription B;
/// Describes the source and destination matrices
/// Describes the source matrix
TensorDescription C;
/// Describes the destination matrix
TensorDescription D;
/// Describes the sparse meta matrices
TensorDescription E;
@ -501,6 +506,7 @@ struct GemmDescription : public OperationDescription {
TensorDescription const &A = TensorDescription(),
TensorDescription const &B = TensorDescription(),
TensorDescription const &C = TensorDescription(),
TensorDescription const &D = TensorDescription(),
NumericTypeID element_epilogue = NumericTypeID::kInvalid,
SplitKMode split_k_mode = SplitKMode::kNone,
ComplexTransform transform_A = ComplexTransform::kNone,
@ -510,6 +516,7 @@ struct GemmDescription : public OperationDescription {
A(A),
B(B),
C(C),
D(D),
element_epilogue(element_epilogue),
split_k_mode(split_k_mode),
transform_A(transform_A),
@ -527,13 +534,14 @@ struct SparseGemmDescription : public GemmDescription {
TensorDescription const &A = TensorDescription(),
TensorDescription const &B = TensorDescription(),
TensorDescription const &C = TensorDescription(),
TensorDescription const &D = TensorDescription(),
TensorDescription const &E = TensorDescription(),
NumericTypeID element_epilogue = NumericTypeID::kInvalid,
SplitKMode split_k_mode = SplitKMode::kNone,
ComplexTransform transform_A = ComplexTransform::kNone,
ComplexTransform transform_B = ComplexTransform::kNone
):
GemmDescription(gemm_kind, A, B, C, element_epilogue, split_k_mode, transform_A, transform_B)
GemmDescription(gemm_kind, A, B, C, D, element_epilogue, split_k_mode, transform_A, transform_B)
{this->E = E;}
};
@ -1019,6 +1027,9 @@ struct GemmUniversalArguments {
int64_t batch_stride_B;
int64_t batch_stride_C;
int64_t batch_stride_D;
// Needed for some 3.x kernels
int sm_count;
};
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -66,6 +66,9 @@ struct GemmFunctionalKey {
LayoutTypeID layout_B;
ComplexTransform transform_B;
NumericTypeID element_C;
LayoutTypeID layout_C;
NumericTypeID element_D;
LayoutTypeID layout_D;
//
// Methods
@ -83,7 +86,10 @@ struct GemmFunctionalKey {
NumericTypeID element_B = NumericTypeID::kF16,
LayoutTypeID layout_B = LayoutTypeID::kColumnMajor,
ComplexTransform transform_B = ComplexTransform::kNone,
NumericTypeID element_C = NumericTypeID::kF16
NumericTypeID element_C = NumericTypeID::kF16,
LayoutTypeID layout_C = LayoutTypeID::kColumnMajor,
NumericTypeID element_D = NumericTypeID::kF16,
LayoutTypeID layout_D = LayoutTypeID::kColumnMajor
):
provider(provider),
gemm_kind(gemm_kind),
@ -95,7 +101,10 @@ struct GemmFunctionalKey {
element_B(element_B),
layout_B(layout_B),
transform_B(transform_B),
element_C(element_C)
element_C(element_C),
layout_C(layout_C),
element_D(element_D),
layout_D(layout_D)
{ }
inline
@ -111,7 +120,10 @@ struct GemmFunctionalKey {
(element_B == rhs.element_B) &&
(layout_B == rhs.layout_B) &&
(transform_B == rhs.transform_B) &&
(element_C == rhs.element_C);
(element_C == rhs.element_C) &&
(layout_C == rhs.layout_C) &&
(element_D == rhs.element_D) &&
(layout_D == rhs.layout_D);
}
inline
@ -137,6 +149,9 @@ std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey
<< " layout_B: " << to_string(k.layout_B) << "\n"
<< " transform_B: " << to_string(k.transform_B) << "\n"
<< " element_C: " << to_string(k.element_C) << "\n"
<< " layout_C: " << to_string(k.layout_C) << "\n"
<< " element_D: " << to_string(k.element_D) << "\n"
<< " layout_D: " << to_string(k.layout_D) << "\n"
<< "}";
return out;
@ -157,18 +172,21 @@ struct GemmFunctionalKeyHasher {
size_t operator()(GemmFunctionalKey const &key) const {
IntHash hash;
return
rotl(hash(int(key.provider)), 1) ^
rotl(hash(int(key.gemm_kind)), 2) ^
return
rotl(hash(int(key.provider)), 1) ^
rotl(hash(int(key.gemm_kind)), 2) ^
rotl(hash(int(key.element_compute)), 3) ^
rotl(hash(int(key.element_scalar)), 4) ^
rotl(hash(int(key.element_A)), 5) ^
rotl(hash(int(key.layout_A)), 6) ^
rotl(hash(int(key.transform_A)), 7) ^
rotl(hash(int(key.element_B)), 8) ^
rotl(hash(int(key.layout_B)), 9) ^
rotl(hash(int(key.transform_B)), 10) ^
rotl(hash(int(key.element_C)), 11);
rotl(hash(int(key.element_scalar)), 4) ^
rotl(hash(int(key.element_A)), 5) ^
rotl(hash(int(key.layout_A)), 6) ^
rotl(hash(int(key.transform_A)), 7) ^
rotl(hash(int(key.element_B)), 8) ^
rotl(hash(int(key.layout_B)), 9) ^
rotl(hash(int(key.transform_B)), 10) ^
rotl(hash(int(key.element_C)), 11) ^
rotl(hash(int(key.layout_C)), 12) ^
rotl(hash(int(key.element_D)), 13) ^
rotl(hash(int(key.layout_D)), 14);
}
};

View File

@ -23,7 +23,8 @@ from library import *
class GemmOperation:
#
def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, D = None,
kernel_schedule = KernelScheduleType.ScheduleAuto, epilogue_schedule = EpilogueScheduleType.ScheduleAuto):
self.prefix = "3x" if gemm_kind == GemmKind.Universal3x else ""
self.operation_kind = OperationKind.Gemm
@ -33,6 +34,15 @@ class GemmOperation:
self.A = A
self.B = B
self.C = C
self.D = D
if self.D == None:
self.D = self.C
if gemm_kind != GemmKind.Universal3x:
assert(kernel_schedule == KernelScheduleType.ScheduleAuto)
assert(epilogue_schedule == EpilogueScheduleType.ScheduleAuto)
self.kernel_schedule = kernel_schedule
self.epilogue_schedule = epilogue_schedule
self.element_epilogue = element_epilogue
self.epilogue_functor = epilogue_functor
self.swizzling_functor = swizzling_functor
@ -122,11 +132,12 @@ class GemmOperation:
def extended_name_3x(self):
'''Generates a string representing the MMA atom. Assumes accumulator type is C type.'''
extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}".format(
extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}_{element_d}".format(
element_a = DataTypeNames[self.A.element],
element_b = DataTypeNames[self.B.element],
element_acc = DataTypeNames[self.tile_description.math_instruction.element_accumulator],
element_c = DataTypeNames[self.C.element],
element_d = DataTypeNames[self.D.element],
core_name = self.core_name())
return extended_name
@ -152,12 +163,20 @@ class GemmOperation:
ShortLayoutTypeNames[self.B.layout],
ShortLayoutTypeNames[self.C.layout])
# Generates a short string representing underlying kernel schedule type
def kernel_schedule_name_3x(self):
return KernelScheduleSuffixes[self.kernel_schedule]
# Generates a short string representing underlying epilogue schedule type
def epilogue_schedule_name_3x(self):
return EpilogueScheduleSuffixes[self.epilogue_schedule]
# Generates the full kernel function name
def procedural_name(self):
''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
if self.arch >= 90:
kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}"
kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}{k}{e}"
return kernel_name_template.format(
p = self.prefix,
ar = self.arch,
@ -171,7 +190,9 @@ class GemmOperation:
ck = self.tile_description.cluster_shape[2],
l = self.tile_description.stages,
s = self.layout_name_3x(),
al = str(max(self.A.alignment, self.B.alignment)))
al = str(max(self.A.alignment, self.B.alignment)),
k = self.kernel_schedule_name_3x(),
e = self.epilogue_schedule_name_3x())
else:
threadblock = self.tile_description.procedural_name()
return "cutlass{p}_{op}_{ex}_{tb}_{l}_align{a}".format(
@ -604,8 +625,7 @@ class EmitGemmUniversal3xInstance:
"cutlass/numeric_types.h",
"cutlass/gemm/kernel/gemm_universal.hpp",
"cutlass/gemm/collective/collective_builder.hpp",
"cutlass/epilogue/collective/default_epilogue.hpp",
"cutlass/epilogue/thread/linear_combination.h",
"cutlass/epilogue/collective/collective_builder.hpp",
]
self.builtin_epilogue_functor_template = """
${epilogue_functor}<
@ -617,6 +637,18 @@ class EmitGemmUniversal3xInstance:
"""
self.gemm_template = """
using ${operation_name}_epilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
${arch}, ${opcode_class},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::epilogue::collective::EpilogueTileAuto,
${element_accumulator}, ${element_epilogue},
${element_c}, ${layout_c}, ${align_c},
${element_d}, ${layout_d}, ${align_d},
${epilogue_schedule}
>::CollectiveOp;
using ${operation_name}_mainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
${arch}, ${opcode_class},
@ -625,18 +657,11 @@ using ${operation_name}_mainloop =
${element_accumulator},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::gemm::collective::StageCountAuto,
cutlass::gemm::collective::KernelScheduleAuto
cutlass::gemm::collective::StageCountAutoCarveout<
sizeof(typename ${operation_name}_epilogue::SharedStorage)>,
${kernel_schedule}
>::CollectiveOp;
using ${operation_name}_epilogue =
cutlass::epilogue::collective::DefaultEpilogue<
cutlass::gemm::TagToStrideC_t<${layout_c}>,
cutlass::gemm::TagToStrideC_t<${layout_c}>,
cutlass::epilogue::thread::LinearCombination<
${element_c}, ${epilogue_vector_length}, ${element_accumulator}, ${element_epilogue}>
>;
// Gemm operator ${operation_name}
using ${operation_name}_base = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
@ -670,8 +695,8 @@ ${compile_guard_end}
stage_count_string = "cutlass::gemm::collective::StageCountAuto"
warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
instance_layout_A, instance_layout_B, instance_layout_C = \
(operation.A.layout, operation.B.layout, operation.C.layout)
instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \
(operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)
# 3.0 profiler integration only supports trivial epilogues for now
epilogue_vector_length = 1
@ -697,6 +722,8 @@ ${compile_guard_end}
'layout_b': LayoutTag[instance_layout_B],
'element_c': DataTypeTag[operation.C.element],
'layout_c': LayoutTag[instance_layout_C],
'element_d': DataTypeTag[operation.D.element],
'layout_d': LayoutTag[instance_layout_D],
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
@ -712,10 +739,14 @@ ${compile_guard_end}
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
'kernel_schedule' : str(KernelScheduleTag[operation.kernel_schedule]),
'epilogue_schedule' : str(EpilogueScheduleTag[operation.epilogue_schedule]),
'epilogue_functor': epilogue_functor,
'stages': stage_count_string,
'align_a': str(operation.A.alignment),
'align_b': str(operation.B.alignment),
'align_c': str(operation.C.alignment),
'align_d': str(operation.C.alignment),
'transform_a': ComplexTransformTag[operation.A.complex_transform],
'transform_b': ComplexTransformTag[operation.B.complex_transform],
'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],

File diff suppressed because it is too large

View File

@ -361,6 +361,58 @@ ShortComplexLayoutNames = {
(LayoutType.RowMajor, ComplexTransform.conj): 'h'
}
###################################################################################################
class KernelScheduleType(enum.Enum):
ScheduleAuto = enum_auto()
Multistage = enum_auto()
Tma = enum_auto()
TmaWarpSpecialized = enum_auto()
TmaWarpSpecializedPingpong = enum_auto()
TmaWarpSpecializedCooperative = enum_auto()
#
KernelScheduleTag = {
KernelScheduleType.ScheduleAuto: 'cutlass::gemm::collective::KernelScheduleAuto',
KernelScheduleType.Multistage: 'cutlass::gemm::KernelMultistage',
KernelScheduleType.Tma: 'cutlass::gemm::KernelTma',
KernelScheduleType.TmaWarpSpecialized: 'cutlass::gemm::KernelTmaWarpSpecialized',
KernelScheduleType.TmaWarpSpecializedPingpong: 'cutlass::gemm::KernelTmaWarpSpecializedPingpong',
KernelScheduleType.TmaWarpSpecializedCooperative: 'cutlass::gemm::KernelTmaWarpSpecializedCooperative',
}
#
KernelScheduleSuffixes = {
KernelScheduleType.ScheduleAuto: '',
KernelScheduleType.Multistage: '_cpasync',
KernelScheduleType.Tma: '_unspecialized',
KernelScheduleType.TmaWarpSpecialized: '_warpspecialized',
KernelScheduleType.TmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
KernelScheduleType.TmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
}
class EpilogueScheduleType(enum.Enum):
ScheduleAuto = enum_auto()
EpilogueTransposed = enum_auto()
NoSmemWarpSpecialized = enum_auto()
TmaWarpSpecialized = enum_auto()
TmaWarpSpecializedCooperative = enum_auto()
#
EpilogueScheduleTag = {
EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
EpilogueScheduleType.EpilogueTransposed: 'cutlass::gemm::EpilogueTransposed',
EpilogueScheduleType.NoSmemWarpSpecialized: 'cutlass::epilogue::NoSmemWarpSpecialized',
EpilogueScheduleType.TmaWarpSpecialized: 'cutlass::epilogue::TmaWarpSpecialized',
EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative',
}
#
EpilogueScheduleSuffixes = {
EpilogueScheduleType.ScheduleAuto: '',
EpilogueScheduleType.EpilogueTransposed: '',
EpilogueScheduleType.NoSmemWarpSpecialized: '_epi_nosmem',
EpilogueScheduleType.TmaWarpSpecialized: '_epi_tma',
EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma',
}
###################################################################################################
#

View File

@ -1,143 +0,0 @@
# PyCUTLASS: CUTLASS Python Interface
PyCUTLASS is a Python interface to the CUTLASS C++ template library. PyCUTLASS takes user-defined operation descriptions, emits C++ code, and compiles it with `nvcc` or `nvrtc`. It also provides wrappers for user-provided arguments from [numpy](https://numpy.org/), [torch](https://pytorch.org/), and [cupy](https://github.com/cupy/cupy) and encodes them into kernel parameters.
```python
import pycutlass
from pycutlass import *
import torch
pycutlass.get_memory_pool(2**8, 2**32)
math_inst = MathInstruction(
[1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
cutlass.OpClass.Simt, MathOperation.multiply_add
)
tile_description = TileDescription(
[128, 128, 8], 4, [2, 4, 1],
math_inst
)
A = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
B = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
C = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
epilogue_functor = LinearCombination(cutlass.float32, 1, cutlass.float32, cutlass.float32)
operation = GemmOperationUniversal(
arch=80, tile_description=tile_description,
A=A, B=B, C=C,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
pycutlass.compiler.add_module([operation,])
problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_D = torch.empty_like(tensor_C)
alpha = 1.0
beta = 0.0
arguments = GemmArguments(
operation=operation, problem_size=problem_size,
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
output_op=operation.epilogue_type(alpha, beta),
gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
)
operation.run(arguments)
arguments.sync()
tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
assert torch.equal(tensor_D, tensor_D_ref)
```
PyCUTLASS also provides infrastructure for profiling, compiled artifact management, and pooled memory management.
## Supported Features
PyCUTLASS currently supports the following operations:
* GEMM with mode {Serial, Parallel Split K, Batched GEMM, Array GEMM}, op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {RowMajor, ColumnMajor, Row/ColumnMajorInterleaved<32> for int8}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, swizzling functions {IdentitySwizzle<1,2,4,8>, HorizontalSwizzle, BatchedIdentitySwizzle}, and epilogue {LinearCombination, LinearCombinationClamp}
* GEMM grouped with op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {RowMajor, ColumnMajor}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, scheduling mode {Host, Device}, and epilogue {LinearCombination, LinearCombinationClamp}.
* Conv2d with {Fprop, Dgrad, Wgrad}, op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {TensorNHWC, TensorNC32HW32 and TensorC32RSK32 for int8}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, split-k mode {Parallel, Serial}, and epilogue {LinearCombination, LinearCombinationClamp}
The tiling sizes of the above operations can also be customized.
## Installation
### Using Docker
We recommend using one of the provided Docker images to run PyCUTLASS.
**To run CUTLASS 3 GEMM kernels targeting the NVIDIA Hopper architecture via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda12.0) based on the NGC CUDA 12.0 container:
```shell
docker build -t pycutlass-cuda12.0:latest -f docker/Dockerfile-cuda12.0 .
docker run --gpus all -it --rm pycutlass-cuda12.0:latest
```
Note that this Docker container does not include CuPy or PyTorch, and, thus, will not be able to run PyCUTLASS examples that
leverage these packages.
**To run CUTLASS 2.x kernels targeting pre-SM90 architectures via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda11.8-pytorch) based on an NGC PyTorch container:
```shell
docker build -t pycutlass-cuda11.8-pytorch:latest -f docker/Dockerfile-cuda11.8-pytorch .
docker run --gpus all -it --rm pycutlass-cuda11.8-pytorch:latest
```
### Environment variables
PyCUTLASS requires two environment variables:
* `CUTLASS_PATH`: the root directory of CUTLASS. You can set this from the location at which you cloned CUTLASS via: `export CUTLASS_PATH=$(pwd)`.
* `CUDA_INSTALL_PATH`: the directory where the CUDA Toolkit is installed. If running in bash with `nvcc` installed under a CUDA Toolkit, you can set this to the location of your `nvcc` installation via: `export CUDA_INSTALL_PATH=$(which nvcc | awk -F'/bin/nvcc' '{print $1}')`
After setting these two environment variables, PyCUTLASS can be installed with
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass && bash build.sh
```
## Examples
Examples can be found in [$CUTLASS_PATH/examples/40_cutlass_py](examples/40_cutlass_py)
## Test
The test cases are listed in `$CUTLASS_PATH/tools/library/scripts/pycutlass/test`. The unit tests can be run with
```shell
# Each of these tests is only supported on devices with compute capability SM80. For other devices,
# see the basic examples in $CUTLASS_PATH/examples/40_cutlass_py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/unit && python test_sm80.py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/example && bash run_all_example.sh
```
## Build Documentation
Run
```shell
bash build_doc.sh
```
## Troubleshooting
### Issue 1: permission denied
Building PyCUTLASS requires installing dependencies into your Python environment, so conda may be an option if you don't have the necessary permissions.
### Issue 2: rmm: module not found
PyCUTLASS manages device memory with [RMM](https://github.com/rapidsai/rmm). Our `build.sh` automatically pulls [rmm branch-22.08](https://github.com/rapidsai/rmm/tree/branch-22.08) from GitHub and builds it from source. RMM is placed at `$CUTLASS_PATH/tools/library/scripts/pycutlass/rmm`. It requires `cmake > 3.20.1`. If the build fails, it can be fixed manually with the following steps:
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm && ./build.sh librmm rmm
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm/python
python setup.py build_ext --inplace
python setup.py install
```
To test whether RMM is successfully installed, try `import rmm`. For other issues related to RMM, please check https://github.com/rapidsai/rmm/issues.

View File

@ -1,36 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
pip install -U pybind11
git clone https://github.com/google/googletest.git
python setup.py develop --user
python setup.py rmm

View File

@ -1,36 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
pip install enum-tools
pip install sphinx-toolbox
pip install m2r2
sphinx-build -b html docs/source/ docs/build/html

View File

@ -1,40 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
FROM nvcr.io/nvidia/pytorch:22.11-py3
RUN chmod ugo+rwx /home
RUN pip uninstall -y rmm
RUN pip install rmm-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
ENV CUDA_INSTALL_PATH=/usr/local/cuda

View File

@ -1,46 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
FROM nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu20.04
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
RUN apt-get install -y git cmake vim python3 python3-pip
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN chmod ugo+rwx /home
RUN pip install numpy==1.23
RUN pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
RUN pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
RUN pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:$LIBRARY_PATH
ENV CUDA_INSTALL_PATH=/usr/local/cuda

View File

@ -1,52 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View File

@ -1,35 +0,0 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

View File

@ -1,96 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'PyCutlass'
copyright = '2022, Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
author = 'Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.duration',
'sphinx.ext.doctest',
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'enum_tools.autoenum',
'sphinx.ext.autosummary',
'm2r2'
]
source_suffix = [".rst", ".md"]
autosummary_generate = True
autosummary_imported_members = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'bizstyle'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

View File

@ -1,13 +0,0 @@
CONV2D Operation
================
.. autoclass:: pycutlass.Conv2dOperation
:special-members:
:members: run
:exclude-members: __weakref__, configuration_name, core_name, extended_name, procedural_name
.. autoclass:: pycutlass.Conv2dArguments
:special-members:
:members:
:exclude-members: initialize
:show-inheritance:

View File

@ -1,100 +0,0 @@
cutlass
=======
.. rubric:: Operator Classification
.. autoclass:: cutlass.OpClass
:members:
.. rubric:: GEMM Layout
.. autoclass:: cutlass.RowMajor
:members:
.. autoclass:: cutlass.ColumnMajor
:members:
.. autoclass:: cutlass.RowMajorInterleaved32
:members:
.. autoclass:: cutlass.ColumnMajorInterleaved32
:members:
.. rubric:: Conv Layout
.. autoclass:: cutlass.TensorNHWC
:members:
.. autoclass:: cutlass.TensorNC32HW32
:members:
.. autoclass:: cutlass.TensorC32RSK32
:members:
.. rubric:: Threadblock Swizzle
.. autoclass:: cutlass.dim3
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle1
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle2
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle4
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle8
:special-members:
:members:
.. autoclass:: cutlass.HorizontalSwizzle
:special-members:
:members:
.. autoclass:: cutlass.BatchedIdentitySwizzle
:special-members:
:members:
.. autoclass:: cutlass.StridedDgradIdentitySwizzle1
:special-members:
:members:
.. autoclass:: cutlass.StridedDgradIdentitySwizzle4
:special-members:
:members:
.. autoclass:: cutlass.StridedDgradHorizontalSwizzle
:special-members:
:members:
.. rubric:: Coordinates
.. autoclass:: cutlass.Tensor4DCoord
:special-members:
:members:
.. autoclass:: cutlass.Tensor3DCoord
:special-members:
:members:
.. autoclass:: cutlass.MatrixCoord
:special-members:
:members:
.. rubric:: Convolution
.. autoclass:: cutlass.conv.Operator
:members:
.. autoclass:: cutlass.conv.IteratorAlgorithm
:members:
.. autoclass:: cutlass.conv.StrideSupport
:members:

View File

@ -1,18 +0,0 @@
GEMM Operation
==============
.. autoclass:: pycutlass.GemmOperationUniversal
:special-members:
:members:
.. autoclass:: pycutlass.GemmOperationGrouped
:special-members:
:members:
.. autoclass:: pycutlass.GemmArguments
:special-members:
:members:
.. autoclass:: pycutlass.GemmGroupedArguments
:special-members:
:members:

View File

@ -1,31 +0,0 @@
.. PyCutlass documentation master file, created by
sphinx-quickstart on Sun Jun 19 12:05:42 2022.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
CUTLASS Python Project Documentation
=====================================
.. mdinclude:: ../../README.md
.. toctree::
:maxdepth: 2
:caption: Contents:
.. Indices and tables
.. ==================
.. * :ref:`genindex`
.. * :ref:`modindex`
.. * :ref:`search`
Indices
==================
.. toctree::
user_guide
visitor_tree
gemm_op
conv2d_op
cutlass

View File

@ -1,225 +0,0 @@
# Epilogue Visitor Tree
The Epilogue Visitor Tree is an experimental feature that directly generates epilogues from user-provided Python functions.
## Usage
The Epilogue Visitor Tree supports many different operations.
### Unary functions
Epilogue Visitor Tree supports unary functions like activation functions. For example,
```python
class UnaryEpilogue_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
alpha: 'scalar', beta: 'scalar'):
#
T = leaky_relu.numpy(accum, 0.2)
Z = alpha * T + beta * c
return Z
epilogue_functor = UnaryEpilogue_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
```
### Broadcast Operation
Epilogue Visitor Tree supports broadcasting row and column vectors to the whole output matrix. To use broadcast, you just need to specify whether the source vector is a `row` vector or a `column` vector. Here is an example.
```python
class ColumnBroadcast_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
vector: 'column', alpha: 'scalar', beta: 'scalar'):
#
T = accum + vector
scale_T = leaky_relu.numpy(alpha * T, 0.2)
Z = scale_T + beta * c
return Z, T
epilogue_functor = ColumnBroadcast_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
```
### Reduction Operation
The Epilogue Visitor Tree also supports row- and column-wise reductions within each threadblock tile. The syntax for a reduction is
```python
{reduction_output} = reduction_op({input_tensor}, {row|column}, {Add}, {threadblock_shape.n|threadblock_shape.m})
```
The `{row|column}` indicates whether the `row` vectors or the `column` vectors are reduced. The `{Add}` specifies the reduction operation. The `{threadblock_shape.n|threadblock_shape.m}` are the reduction lengths.
**Constraint**
* The `{input_tensor}` can only be the name of a source or an intermediate result. `reduction_op(A + B, ...)` will not work; use `C = A + B` followed by `reduction_op(C, ...)` instead.
* The `{reduction_output}` cannot be used in the epilogue. It will be directly written to global memory after the reduction is done.
```python
class RowReduction_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
alpha: 'scalar', beta: 'scalar'):
#
D = alpha * accum + tanh.numpy(beta * c)
reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
return D, reduction
epilogue_functor = RowReduction_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```
## Get output_op
As shown in the user guide, an `output_op` is required by the argument wrapper. We will take the `RowReduction_` as an example to show how to get `output_op`.
```python
class RowReduction_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
alpha: 'scalar', beta: 'scalar'):
#
D = alpha * accum + tanh.numpy(beta * c)
reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
return D, reduction
epilogue_functor = RowReduction_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
cta_n = args.threadblock_shape[1]
num_cta_n = (problem_size.n() + cta_n - 1) // cta_n
reduction = np.zeros(shape=(args.batch * problem_size.m() * num_cta_n,), dtype=getattr(np, element_c))
# get output op
output_op = operation.epilogue_type(
D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C, reduction=reduction, problem_size=[problem_size.m(), problem_size.n()]
)
```
Like other epilogue functors such as `LinearCombination`, the output op for EpilogueVisitorTree is also created with `operation.epilogue_type(*)`. However, there are two differences:
* The arguments need to be passed as keyword arguments. The keywords are the argument names in `def __call__`.
* An additional `problem_size=[problem_size.m(), problem_size.n()]` is required.
## Add new Unary Operation (e.g. Activation Function)
To add an additional unary operation to the epilogue visitor tree, a new unary op
should be created for `VisitorOpUnary`. We will take `tanh` as an example.
### Step 1: define TanhVisitor
The visitor defines the parameters and computation required by the unary operation.
The unary operations are registered in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h). But you can define it in any header file and include the header file in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h).
* Two template arguments are required:
* `T`: data type used to compute the unary operation
* `N`: compute fragment length
* We also need to provide the `Arguments` and `Params` structures. The `Arguments` will be assembled by [ctypes](https://docs.python.org/3/library/ctypes.html); the `Params` will be generated from `Arguments` automatically. If the unary function takes no argument, an integer like `int tmp` can be provided to ensure the correctness of ctypes.
* The constructor can only take `params` as its single argument.
* The operation is defined in `Array<T, N> operator()(Array<T, N> const &frag) const`. One common way to do this is to first define a scalar computation, and then apply it to the fragment with an unrolled for-loop.
* A guard function is required. If it returns `false`, all child nodes of the unary node are disabled and zeros are returned to the parent node. This is very helpful for multiplication with a scalar when the scalar is `0`. For general cases, you can just return `true`.
```c++
// T: data type used to compute the unary operation
// N: compute fragment length
template <typename T, int N>
struct TanhVisitor {
/// Argument
struct Arguments {
// a placeholder argument to ensure correctness of ctypes
int tmp;
CUTLASS_HOST_DEVICE
Arguments(): tmp(0) { };
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { };
};
/// Param
struct Params {
CUTLASS_HOST_DEVICE
Params(){ };
Params(Arguments const &args) { }
};
/// Constructor
CUTLASS_HOST_DEVICE
TanhVisitor(Params const &params) { }
// scalar operator
CUTLASS_HOST_DEVICE
T tanh_op(T const &scalar) const {
return fast_tanh(scalar);
}
/// vector operator
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
Array<T, N> y;
CUTLASS_PRAGMA_UNROLL
for (int i=0; i < N; ++i) {
y[i] = tanh_op(frag[i]);
}
return y;
}
// Guard
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
```
### Step 2: register Tanh function
After defining the function in C++, we need to register it in Python. The class below gives an example.
* The init function takes the data type `element_compute`, which will be the `T` in the C++ template.
In the init function, we also generate the `_Arguments` class as a `ctypes.Structure`. It includes all the data members in the `TanhVisitor::Arguments`.
* The `_Arguments` class needs to be registered as `self.argument_type` of the `tanh` class.
* An `emit` function is required to emit the namespace and typename of `TanhVisitor`.
* A staticmethod that serves as the numpy reference implementation is required; this is the Python code that the parser consumes.
The built-in functions are defined in [pycutlass/src/pycutlass/epilogue.py](tools/library/scripts/pycutlass/src/pycutlass/epilogue.py). You can define yours in any file as long as it can be found by [/pycutlass/src/pycutlass/parser.py](tools/library/scripts/pycutlass/src/pycutlass/parser.py).
```python
class tanh(ActivationFunctor):
def __init__(self, element_compute) -> None:
super().__init__()
class _Arguments(ctypes.Structure):
_fields_ = [
("tmp", ctypes.c_int)
]
def __init__(self, *args) -> None:
self.tmp = 0
self.argument_type = _Arguments
def emit(self):
return "cutlass::TanhVisitor"
@staticmethod
def numpy(x: np.ndarray):
return np.tanh(x)
```
### Step 3: Run the function
Now the new unary op is ready to use. An epilogue visitor tree can be built with
```python
class RowReduction_(EpilogueVisitTree):
def __call__(
self, accum: NDArray['tensor', 'float32'], c: NDArray['tensor', 'float32'],
alpha: 'float32', beta: 'float32'):
#
D = alpha * accum + tanh.numpy(beta * c)
reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
return D, reduction
epilogue_functor = RowReduction_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```
## Limitations and Future work
Although the Epilogue Visitor Tree brings great flexibility to epilogue construction, formulating the epilogue as a single tree imposes several limitations.
* [Future Work] Serial and Parallel Split-K GEMM are not supported yet.
* To support serial split-k, an additional tree transformation pass is required to inject a `binaryOpNode(Add)` + `TensorInputNode` before each `TensorOutputNode` to fetch the partial sum back. The `semaphore` also needs to be passed into the epilogue.
* To support parallel split-k, a reduction-with-visitor kernel is required.
* [Future Work] Convolution and GEMM Grouped are not supported yet.
* To support Conv2d and GEMM Grouped, corresponding *_with_visitor kernels are required.
* [Limitation] If the same node is used by two operations (except when one of them is a reduction), the node and all its descendants will be executed twice.
* [Limitation] The result of reduction can only be used as the return value.

View File

@ -1,283 +0,0 @@
# Basics of PyCUTLASS
PyCUTLASS handles the following things when launching CUTLASS kernels:
* Memory management
* Operation Description
* Code emission and compilation
* Arguments preprocessing
* Kernel launching
* Result Synchronization
## Memory management
PyCUTLASS uses [RMM](https://github.com/rapidsai/rmm) to manage device memory. At the beginning of the program, call
```python
pycutlass.get_memory_pool({init_pool_size_in_bytes}, {max_pool_size_in_bytes})
```
We also provide functions to query the allocated size.
```python
bytes = get_allocated_size()
```
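Putting both calls together, a minimal sketch (the pool sizes here are arbitrary, and `get_allocated_size` is assumed to be exported by `pycutlass`, as the snippet above suggests):
```python
import pycutlass
from pycutlass import *

# Initialize the RMM pool: 256 bytes initially, growable to 4 GiB.
pycutlass.get_memory_pool(2**8, 2**32)

# Query how many bytes of device memory are currently allocated.
print(get_allocated_size())
```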
## Operation Description
PyCUTLASS provides operation descriptions for GEMM, GEMM Grouped, and Conv2d operations. These operation descriptions are assembled from four fundamental concepts:
* Math Instruction: math instruction executed in GPU cores
* Tile Description: tiling sizes and pipeline stages
* Operand Description: data type, layout, memory alignment
* Epilogue Functor: epilogue function
### Math Instruction
The math instruction is defined as follows:
```python
math_inst = MathInstruction(
{instruction_shape}, {element_a}, {element_b},
{element_acc}, {opclass}, {math_operation}
)
```
The `{instruction_shape}` and `{opclass}` define the instruction size and type. The table below lists valid combinations; a concrete sketch follows the table. `{element_a}` and `{element_b}` define the source operand data types for each instruction, and `{element_acc}` defines the accumulator type. The `{math_operation}` defines the math operation applied.
|Opclass | element_a/element_b | element_acc | instruction_shape | math_operation |
| -- | -- | -- | -- | -- |
| cutlass.OpClass.TensorOp | cutlass.float64 | cutlass.float64 | [8, 8, 4] | MathOperation.multiply_add|
| | cutlass.float32, cutlass.tfloat32, cutlass.float16, cutlass.bfloat16 | cutlass.float32 | [16, 8, 8] | MathOperation.multiply_add, MathOperation.multiply_add_fast_f32, MathOperation.multiply_add_fast_f16, MathOperation.multiply_add_fast_bf16 |
| | cutlass.float16 | cutlass.float16/cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.bfloat16 | cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.int8 | cutlass.int32 | [16, 8, 32] | MathOperation.multiply_add_saturate|
|cutlass.OpClass.Simt| cutlass.float64 | cutlass.float64 | [1, 1, 1] | MathOperation.multiply_add |
| | cutlass.float32 | cutlass.float32 | [1, 1, 1] | MathOperation.multiply_add |
The `cutlass.OpClass.TensorOp` indicates that Tensor Cores are used, while `cutlass.OpClass.Simt` uses the SIMT cores.
The `multiply_add_fast_f32` emulates a fast, accurate SGEMM kernel accelerated by Ampere Tensor Cores. More details can be found in [examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm](examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm).
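For instance, a minimal sketch of the f16 Tensor Core row from the table above (f16 operands, f32 accumulator, 16x8x16 MMA instruction); the imports mirror the quickstart in the PyCUTLASS README:
```python
import pycutlass
from pycutlass import *

# f16 x f16 -> f32 Tensor Core MMA with a 16x8x16 instruction shape,
# matching the corresponding row of the table above.
math_inst = MathInstruction(
    [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float32,
    cutlass.OpClass.TensorOp, MathOperation.multiply_add
)
```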
### Tile Description
The tile description describes the threadblock and warp tiling sizes, as well as the pipeline stages.
```python
tile_description = TileDescription(
{threadblock_shape}, {stages}, {warp_count},
math_inst
)
```
The `{threadblock_shape}` is a list of 3 integers `[Tile_M, Tile_N, Tile_K]` that defines the threadblock tiling size. `{stages}` defines the number of software pipeline stages ([detail](https://developer.nvidia.com/blog/controlling-data-movement-to-boost-performance-on-ampere-architecture/)). `{warp_count}` defines the number of warps along `M`, `N`, and `K` dimension. I.e., with `{threadblock_shape}=[Tile_M, Tile_N, Tile_K]` and `{warp_count}=[W_M, W_N, W_K]`, the warp tile size would be `[Tile_M / W_M, Tile_N / W_N, Tile_K / W_K]`.
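As a concrete sketch, pairing the Tensor Core `math_inst` above with a 128x128x32 threadblock tile, 3 pipeline stages, and a 2x2x1 warp arrangement (an illustrative choice, not a requirement) yields a 64x64x32 warp tile:
```python
# Threadblock tile [128, 128, 32] split across warp_count [2, 2, 1]
# gives a warp tile of [128/2, 128/2, 32/1] = [64, 64, 32].
tile_description = TileDescription(
    [128, 128, 32], 3, [2, 2, 1],
    math_inst
)
```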
### Operand Description
The Operand Description defines the data type, layout, and memory alignment of the input tensors A, B, and C. The output tensor D shares the same attributes as C. The description is as follows:
```python
A = TensorDescription(
{element_a}, {layout_a}, {alignment_a}
)
B = TensorDescription(
{element_b}, {layout_b}, {alignment_b}
)
C = TensorDescription(
{element_c}, {layout_c}, {alignment_c}
)
```
The table below lists the supported layouts and data types for each operation:
| Operation | data type | layout |
| -- | -- | -- |
| GEMM, GEMM Grouped | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.RowMajor, cutlass.ColumnMajor |
| | cutlass.int8 | cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32|
| Conv2d Fprop, Dgrad, Wgrad | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.TensorNHWC |
| Conv2d Fprop | cutlass.int8 | cutlass.TensorNHWC, cutlass.TensorNC32HW32, cutlass.TensorC32RSK32|
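For example, half-precision row-major A and B aligned to 8 elements, with a single-precision row-major C aligned to 4 elements, can be described as follows (passing the fields positionally, per the template above):
```python
A = TensorDescription(cutlass.float16, cutlass.RowMajor, 8)
B = TensorDescription(cutlass.float16, cutlass.RowMajor, 8)
C = TensorDescription(cutlass.float32, cutlass.RowMajor, 4)
```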
### Epilogue Functor
The epilogue functor defines the epilogue executed after the mainloop.
We expose the following epilogue functors:
| Epilogue Functor | Remark |
| -- | -- |
| LinearCombination | $D=\alpha \times Accum + \beta \times C$ |
| LinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; the output is clamped to the representable range of the output data type |
| FastLinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; only used for problem sizes $K\le 256$ with cutlass.int8 output, accumulator data type `cutlass.int32`, and epilogue compute data type `cutlass.float32` |
| LinearCombinationGeneric | $D = activation(\alpha \times Accum + \beta \times C)$; available activations include `relu`, `leaky_relu`, `tanh`, `sigmoid`, `silu`, `hardswish`, and `gelu` |
The epilogue functors can be created as follows:
```python
# LinearCombination
epilogue_functor = LinearCombination(
element_C, alignment_c, element_acc, element_epilogue_compute
)
# LinearCombinationClamp
epilogue_functor = LinearCombinationClamp(
element_C, alignment_c, element_acc, element_epilogue_compute
)
# FastLinearCombinationClamp
epilogue_functor = FastLinearCombinationClamp(
element_C, alignment_c
)
# LinearCombinationGeneric
epilogue_functor = LinearCombinationGeneric(
relu(element_epilogue_compute), element_C, alignment_c,
element_acc, element_epilogue_compute
)
```
We also provide an experimental feature, the "Epilogue Visitor Tree", for the GEMM operation. Details can be found in [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md).
### GEMM Operation
The GEMM Operation description can be created with
```python
operation = GemmOperationUniversal(
{compute_capability}, tile_description,
A, B, C, epilogue_functor,
{swizzling_functor}, {visitor}
)
```
* `{compute_capability}` is an integer indicating the compute capability of the target GPU. For A100, it is 80.
* `{swizzling_functor}` describes how threadblocks are scheduled on the GPU. This is used to improve L2 locality ([detail](https://developer.nvidia.com/blog/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling/)). Currently we support `cutlass.{IdentitySwizzle1|IdentitySwizzle2|IdentitySwizzle4|IdentitySwizzle8|BatchedIdentitySwizzle}`. The last is used for batched or array GEMM.
* `{visitor}`: a boolean indicating whether the epilogue visitor tree is used (a sketch follows this list).
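Putting the pieces together, a minimal sketch of an SM80 GEMM description without the epilogue visitor tree might look like the following (reusing `tile_description`, `A`, `B`, `C`, and `epilogue_functor` from the previous sections, and assuming `{visitor}` may be passed positionally as `False`):
```python
operation = GemmOperationUniversal(
    80, tile_description,
    A, B, C, epilogue_functor,
    cutlass.IdentitySwizzle1, False  # no epilogue visitor tree
)
```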
### GEMM Grouped Operation
The GEMM Grouped Operation description can be created with
```python
operation = GemmOperationGrouped(
compute_capability, tile_description,
A, B, C, epilogue_functor,
swizzling_functor, {precompute_mode}
)
```
* `{precompute_mode}`: either `SchedulerMode.Host` or `SchedulerMode.Device`. See [examples/24_gemm_grouped](examples/24_gemm_grouped) for more details.
### Conv2d Operation
The Conv2d Operation description can be created with
```python
operation = Conv2dOperation(
{conv_kind}, {iterator_algorithm},
compute_capability, tile_description,
A, B, C, {stride_support},
epilogue_functor, swizzling_functor
)
```
* `{conv_kind}` defines which convolution is executed. Available options are `fprop`, `dgrad`, and `wgrad`.
* `{iterator_algorithm}` specifies the iterator algorithm used by the implicit GEMM in the convolution. The options are as follows:
    * `analytic`: functionally correct in all cases, but at lower performance
    * `optimized`: optimized for R <= 32, S <= 32, and unity-stride dgrad
    * `fixed_channels`: analytic algorithm optimized for a fixed channel count (C == AccessSize)
    * `few_channels`: analytic algorithm optimized for few channels (C divisible by AccessSize)
* `{stride_support}`: distinguishes among partial specializations that accelerate certain problems where the convolution stride is unity:
    * `strided`: arbitrary convolution stride
    * `unity`: unit convolution stride
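For example, an SM80 forward-propagation convolution using the optimized iterator algorithm and strided stride support might be described as follows (a sketch reusing the descriptions from above, with arguments passed per the template):
```python
operation = Conv2dOperation(
    cutlass.conv.Operator.fprop, cutlass.conv.IteratorAlgorithm.optimized,
    80, tile_description,
    A, B, C, StrideSupport.Strided,
    epilogue_functor, cutlass.IdentitySwizzle1
)
```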
***
## Code Emission and Compilation
Once the operation description is created, the related host and device code can be compiled with
```python
import pycutlass
pycutlass.compiler.add_module([operation,])
```
Several operations can be compiled together. By default, the `nvcc` at `$CUDA_INSTALL_PATH/bin` is used as the compiler backend, but you can also switch to [CUDA Python](https://nvidia.github.io/cuda-python/overview.html)'s `nvrtc` with
```python
pycutlass.compiler.nvrtc()
```
We also have an internal compilation artifact manager that caches compiled kernels both in memory and on disk. The `compiled_cache.db` file in your workspace is the database that contains the binary files. Delete this file if you want to recompile the kernels.
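For example, assuming two previously constructed descriptions named `gemm_operation` and `conv_operation`, both can be compiled in one call, and the on-disk cache can be removed to force recompilation:
```python
import os
import pycutlass

# compile several operations together
pycutlass.compiler.add_module([gemm_operation, conv_operation])

# delete the on-disk kernel cache to force recompilation on the next run
if os.path.exists("compiled_cache.db"):
    os.remove("compiled_cache.db")
```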
***
## Argument Processing
We provide argument wrappers to convert Python tensors to kernel parameters. Currently, [torch.Tensor](https://pytorch.org/), [numpy.ndarray](https://numpy.org/), and [cupy.ndarray](https://cupy.dev/) are supported.
### GEMM Arguments
The Gemm arguments can be created with
```python
arguments = GemmArguments(
operation=operation, problem_size={problem_size},
A={tensor_A}, B={tensor_B}, C={tensor_C}, D={tensor_D},
output_op={output_op},
gemm_mode={gemm_mode},
split_k_slices={split_k_slices}, batch={batch}
)
```
* `problem_size` is a `cutlass.gemm.GemmCoord(M, N, K)` object that defines an $M\times N\times K$ matrix multiplication.
* `tensor_X`: user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
* `gemm_mode`, `split_k_slices`, and `batch` are related as follows:
|gemm_mode| split_k_slices | batch | remark|
|--|--|--|--|
|cutlass.gemm.Mode.Gemm | number of split-K slices | - | the ordinary GEMM or GEMM with serial split-K|
|cutlass.gemm.Mode.GemmSplitKParallel | number of split-K slices | - | GEMM Split-K Parallel|
|cutlass.gemm.Mode.Batched | - | batch size | Batched GEMM |
|cutlass.gemm.Mode.Array | - | batch size | Array GEMM |
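As a concrete sketch, an ordinary GEMM with flattened `numpy` tensors could be set up as follows (the sizes are illustrative, and the data types assume the `operation` constructed earlier in this guide):
```python
import numpy as np

M, N, K = 512, 256, 128
tensor_A = np.random.uniform(-1, 1, (M * K,)).astype(np.float16)
tensor_B = np.random.uniform(-1, 1, (K * N,)).astype(np.float16)
tensor_C = np.random.uniform(-1, 1, (M * N,)).astype(np.float32)
tensor_D = np.zeros((M * N,)).astype(np.float32)

arguments = GemmArguments(
    operation=operation, problem_size=cutlass.gemm.GemmCoord(M, N, K),
    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
    output_op=operation.epilogue_type(1.0, 0.0),
    gemm_mode=cutlass.gemm.Mode.Gemm, split_k_slices=1
)
```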
### GEMM Grouped Arguments
The GEMM grouped arguments can be created with
```python
arguments = GemmGroupedArguments(
    operation, {problem_sizes_coord}, {tensor_As}, {tensor_Bs}, {tensor_Cs}, {tensor_Ds},
    output_op=output_op
)
```
* `{problem_sizes_coord}` is a list of `cutlass.gemm.GemmCoord(M, N, K)`, one per problem.
* `{tensor_Xs}` is a list of user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
### Conv2d Arguments
The Conv2d arguments can be created with
```python
arguments = Conv2dArguments(
operation, {problem_size}, {tensor_A},
{tensor_B}, {tensor_C}, {tensor_D},
{output_op},
{split_k_mode},
{split_k_slices}
)
```
* `problem_size`: it can be constructed with
```python
problem_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(N, H, W, C),
cutlass.Tensor4DCoord(K, R, S, C),
cutlass.Tensor4DCoord(pad[0], pad[1], pad[2], pad[3]),
cutlass.MatrixCoord(stride[0], stride[1]),
cutlass.MatrixCoord(dilation[0], dilation[1]),
cutlass.conv.Mode.cross_correlation,
split_k_slices, 1
)
```
* `tensor_X`: user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
* `split_k_mode`: currently we support `cutlass.conv.SplitKMode.Serial` and `cutlass.conv.SplitKMode.Parallel`.
* `split_k_slices`: the number of split-K slices.
For an ordinary conv2d, just use `cutlass.conv.SplitKMode.Serial` with `split_k_slices=1`.
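Putting it together, an ordinary 3x3 fprop problem might be set up as follows (a sketch; the tensor sizes use the `activation_size()`, `filter_size()`, and `output_size()` helpers of `Conv2dProblemSize`):
```python
import numpy as np

problem_size = cutlass.conv.Conv2dProblemSize(
    cutlass.Tensor4DCoord(32, 224, 224, 128),  # activation (N, H, W, C)
    cutlass.Tensor4DCoord(128, 3, 3, 128),     # filter (K, R, S, C)
    cutlass.Tensor4DCoord(1, 1, 1, 1),         # padding
    cutlass.MatrixCoord(1, 1),                 # stride
    cutlass.MatrixCoord(1, 1),                 # dilation
    cutlass.conv.Mode.cross_correlation,
    1, 1                                       # split_k_slices, groups
)

tensor_A = np.random.uniform(-1, 1, (problem_size.activation_size(),)).astype(np.float16)
tensor_B = np.random.uniform(-1, 1, (problem_size.filter_size(),)).astype(np.float16)
tensor_C = np.random.uniform(-1, 1, (problem_size.output_size(),)).astype(np.float32)
tensor_D = np.zeros((problem_size.output_size(),)).astype(np.float32)

arguments = Conv2dArguments(
    operation, problem_size, tensor_A, tensor_B, tensor_C, tensor_D,
    operation.epilogue_type(1.0, 0.0),
    cutlass.conv.SplitKMode.Serial, 1
)
```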
### Getting output_op
An `output_op` can be created as follows:
```python
output_op = operation.epilogue_type(*([alpha, beta] + args.activation_args))
```
It is constructed from a list of arguments starting with the scaling factors `alpha` and `beta`.
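For a plain `LinearCombination` epilogue there are no activation arguments, so this reduces to the two scalars:
```python
output_op = operation.epilogue_type(1.0, 0.0)  # alpha = 1.0, beta = 0.0
```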
The `output_op` of EpilogueVisitorTree is slightly different. Please check [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md) for details.
## Kernel Launching
With the arguments and operation in hand, the kernel can be launched simply with
```python
operation.run(arguments)
```
## Sync results
We also provide a function to synchronize the kernel execution. If you use `numpy`, it will also copy the result back to the host. To do so, run
```python
arguments.sync()
```
If you use EpilogueVisitorTree, please call
```python
output_op.sync()
```
## Reduction Kernel behind Parallel Split-K
If you use parallel split-K in GEMM or Conv2d, an additional reduction kernel is required. Please check [examples/40_cutlass_py](examples/40_cutlass_py) for details.

View File

@ -1,4 +0,0 @@
User Guide
=====================================
.. mdinclude:: ./md/basic_idea.md

View File

@ -1,4 +0,0 @@
User Guide
=====================================
.. mdinclude:: ./md/EpilogueVisitorTree.md

View File

@ -1,106 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from pycutlass import *
import pycutlass
from pycutlass.epilogue import LinearCombination
from pycutlass.test.conv2d_testbed import Conv2dLauncher
if __name__ == "__main__":
pycutlass.get_memory_pool(2**33, 2**33)
pycutlass.compiler.nvcc()
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float32,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=4,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
profiler = Conv2dLauncher(operation, verification=False, profiling=True)
python_runtime = profiler.run(
problem_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(32, 224, 224, 128),
cutlass.Tensor4DCoord(128, 3, 3, 128),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
), split_k_mode=cutlass.conv.SplitKMode.Serial
)
cpp_runtime = profiler.run_cutlass_profiler(
problem_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(32, 224, 224, 128),
cutlass.Tensor4DCoord(128, 3, 3, 128),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
), split_k_mode=cutlass.conv.SplitKMode.Serial
)
print(cpp_runtime / python_runtime)

View File

@ -1,91 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.test.gemm_testbed import GemmUniversalLauncher
if __name__ == '__main__':
    pycutlass.get_memory_pool(2**32, 2**32)
    pycutlass.compiler.nvcc()

    math_inst = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16, element_b=cutlass.float16,
        element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )

    tile_description = TileDescription(
        threadblock_shape=[256, 128, 32],
        stages=3, warp_count=[4, 2, 1],
        math_instruction=math_inst
    )

    A = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor,
        alignment=4
    )
    B = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor,
        alignment=4
    )
    C = TensorDescription(
        element=cutlass.float32, layout=cutlass.ColumnMajor,
        alignment=4
    )

    element_epilogue = cutlass.float32
    epilogue_functor = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)
    swizzling_functor = cutlass.IdentitySwizzle1

    operation = GemmOperationUniversal(
        arch=80, tile_description=tile_description,
        A=A, B=B, C=C, element_epilogue=element_epilogue,
        epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
    )

    profiler = GemmUniversalLauncher(operation, verification=False, profiling=True)

    python_runtime = profiler.run(
        mode=cutlass.gemm.Mode.Gemm,
        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096)
    )

    cpp_runtime = profiler.run_cutlass_profiler(
        mode=cutlass.gemm.Mode.Gemm,
        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096),
    )

    print(cpp_runtime / python_runtime)

View File

@ -1,9 +0,0 @@
[build-system]
requires = [
"setuptools",
"scikit-build>0.13.1",
"pybind11",
"numpy<1.23",
"cmake>=3.20.1,!=3.23.0"
]

View File

@ -1,116 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import distutils.cmd
from setuptools import setup
import setuptools.command.build_py
import os
# build rmm dependency
class BuildRMM(distutils.cmd.Command):
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        try:
            import rmm
        except ImportError:
            print("installing rmm")
            os.system("git clone -b branch-22.10 --recurse-submodules https://github.com/rapidsai/rmm.git")
            os.chdir("./rmm")
            os.system("./build.sh librmm rmm")
            os.chdir("./python")
            os.system("python setup.py build_ext --inplace")
            os.system("python setup.py install")
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
ext_modules = []
try:
    from pybind11.setup_helpers import Pybind11Extension, build_ext

    include_dirs = [
        cutlass_path + "/include",
        cuda_install_path + "/include",
        cutlass_path + "/tools/util/include",
        cutlass_path + "/test",
        cutlass_path + "/tools/library/scripts/pycutlass/googletest/googletest/include"
    ]

    ext_modules = [
        Pybind11Extension("cutlass",
                          ["src/cpp/cutlass.cpp"],
                          include_dirs=include_dirs,
                          extra_compile_args=["-fpermissive", "-w", "-std=c++17"]),
        Pybind11Extension("cute",
                          ["src/cpp/cute.cpp"],
                          include_dirs=include_dirs,
                          extra_compile_args=["-fpermissive", "-w", "-std=c++17"])
    ]
except ImportError:
    pass
setup(
    name="PyCutlass",
    version="0.0.1",
    author="Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall",
    author_email="zhaodongc@nvidia.com",
    description="Python interface for CUTLASS",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    package_dir={"": "src"},
    packages=['pycutlass', 'pycutlass.utils', 'pycutlass.test'],
    setup_requires=["pybind11", "numpy<1.23"],
    install_requires=[
        "numpy<1.23",
        'pybind11',
        'cuda-python>=11.8.0',
        'typeguard',
        'bfloat16',
        'typing',
        'scikit-build',
        'treelib'
    ],
    cmdclass={
        'rmm': BuildRMM
    },
    ext_modules=ext_modules,
    python_requires=">=3.6",
)

View File

@ -1,75 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief In-memory compiled artifact cache
*/
#include <pybind11/pybind11.h>
#include <string>
#include <unordered_map>
namespace py = pybind11;
namespace cutlass {
struct CompileCache {
public:
CompileCache() = default;
~CompileCache() = default;
using Cache = std::unordered_map<std::string, py::object>;
/// Return the compiled kernel if it has been cached; py::none() otherwise
py::object at(const std::string &kernel) {
auto item = cache_.find(kernel);
if (item != cache_.end()) {
return item->second;
}
return py::none();
}
/// Insert a newly compiled kernel into the cache
void insert(const std::string &kernel, const py::object &compiled_kernel){
cache_.emplace(kernel, compiled_kernel);
}
int64_t size() const { return cache_.size(); }
/// Clear the cache
void clear() { cache_.clear(); }
private:
Cache cache_;
};
} // namespace cutlass

View File

@ -1,54 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief binding CuTe C++ APIs to Python
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cute/arch/mma_sm90_gmma.hpp"
namespace py = pybind11;
PYBIND11_MODULE(cute, m) {
// module doc
m.doc() = "CuTe C++ bindings";
py::enum_<cute::GMMA::Major>(m, "GMMAMajor",
R"pbdoc(classification of CuTe GMMA tensor major specification)pbdoc")
.value("K", cute::GMMA::Major::K,
R"pbdoc(Tensor is contiguous in reduction dimension)pbdoc")
.value("MN", cute::GMMA::Major::MN,
R"pbdoc(Tensor is contiguous in non-reduction dimension)pbdoc");
}

View File

@ -1,182 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief binding CUTLASS C++ APIs to Python
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "builtin_types.h"
#include "device_launch_parameters.h"
#include "stddef.h"
#include "cutlass/cutlass.h"
#include "include/conv/convolution.h"
#include "include/gemm/gemm.h"
#include "include/types.h"
#include "include/layout/layout.h"
#include "include/tensor_coord.h"
#include "include/arch.h"
#include "include/tensor_ref_view.h"
#include "include/swizzling.h"
#include "test/conv/convolution.h"
#include "test/gemm/gemm.h"
// Data Types
#include "library.h"
// compiler
#include "compiler.h"
namespace py = pybind11;
PYBIND11_MODULE(cutlass, m) {
// module doc
m.doc() = "cutlass C++ binding";
//
// Bind data type
//
bind_cutlass_types(m);
//
// Bind layout
//
bind_layout(m);
//
// Bind tensor coord
//
bind_tensor_coord(m);
//
// Bind tensor ref
//
bind_tensor_refs_and_views(m);
//
// Bind opcode
//
bind_opcode(m);
//
// Bind convolution
//
py::module_ conv_submodule = m.def_submodule("conv");
bind_convolution(conv_submodule);
//
// Bind gemm
//
py::module_ gemm_submodule = m.def_submodule("gemm");
bind_gemm(gemm_submodule);
//
// Bind swizzling
//
bind_threadblock_swizzle(m);
//
// Bind test units
//
py::module_ test = m.def_submodule("test");
py::module_ test_conv = test.def_submodule("conv");
bind_convolution_test(test_conv);
py::module_ test_gemm = test.def_submodule("gemm");
bind_gemm_test(test_gemm);
// data types
py::enum_<cutlass::DataType>(m, "dtype")
.value("b1", cutlass::DataType::kB1)
.value("u2", cutlass::DataType::kU2)
.value("u4", cutlass::DataType::kU4)
.value("u8", cutlass::DataType::kU8)
.value("u16", cutlass::DataType::kU16)
.value("u32", cutlass::DataType::kU32)
.value("u64", cutlass::DataType::kU64)
.value("s2", cutlass::DataType::kS2)
.value("s4", cutlass::DataType::kS4)
.value("s16", cutlass::DataType::kS16)
.value("s64", cutlass::DataType::kS64)
.value("cf16", cutlass::DataType::kCF16)
.value("cbf16", cutlass::DataType::kCBF16)
.value("cf32", cutlass::DataType::kCF32)
.value("ctf32", cutlass::DataType::kCTF32)
.value("cf64", cutlass::DataType::kCF64)
.value("cs2", cutlass::DataType::kCS2)
.value("cs4", cutlass::DataType::kCS4)
.value("cs8", cutlass::DataType::kCS8)
.value("cs16", cutlass::DataType::kCS16)
.value("cs32", cutlass::DataType::kCS32)
.value("cs64", cutlass::DataType::kCS64)
.value("cu2", cutlass::DataType::kCU2)
.value("cu4", cutlass::DataType::kCU4)
.value("cu8", cutlass::DataType::kCU8)
.value("cu16", cutlass::DataType::kCU16)
.value("cu32", cutlass::DataType::kCU32)
.value("cu64", cutlass::DataType::kCU64)
.value("invalid", cutlass::DataType::kInvalid);
// layout types
py::enum_<cutlass::LayoutType>(m, "layout")
.value("ColumnMajorInterleaved2", cutlass::LayoutType::kColumnMajorInterleaved2)
.value("RowMajorInterleaved2", cutlass::LayoutType::kRowMajorInterleaved2)
.value("ColumnMajorInterleaved64", cutlass::LayoutType::kColumnMajorInterleaved64)
.value("RowMajorInterleaved64", cutlass::LayoutType::kRowMajorInterleaved64)
.value("TensorNDHWC", cutlass::LayoutType::kTensorNDHWC)
.value("TensorNCHW", cutlass::LayoutType::kTensorNCHW)
.value("TensorNGHWC", cutlass::LayoutType::kTensorNGHWC)
.value("TensorNC64HW64", cutlass::LayoutType::kTensorNC64HW64)
.value("TensorC64RSK64", cutlass::LayoutType::kTensorC64RSK64);
// transform types
py::enum_<cutlass::ComplexTransform>(m, "complex_transform")
.value("none", cutlass::ComplexTransform::kNone)
.value("conj", cutlass::ComplexTransform::kConjugate);
//
// Compiler
//
py::class_<cutlass::CompileCache>(m, "CompileCache")
.def(py::init<>())
.def("at", &cutlass::CompileCache::at)
.def("insert", &cutlass::CompileCache::insert)
.def("size", &cutlass::CompileCache::size)
.def("clear", &cutlass::CompileCache::clear);
}

View File

@ -1,59 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind opcode classes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/arch/mma.h"
namespace py = pybind11;
namespace cutlass {
enum class OpcodeClass {
kSimt, kTensorOp, kWmmaTensorOp, kSparseTensorOp
};
}
void bind_opcode(py::module &m) {
py::enum_<cutlass::OpcodeClass>(m, "OpClass",
R"pbdoc(classification of math operators)pbdoc")
.value("Simt", cutlass::OpcodeClass::kSimt,
R"pbdoc(Tag classifying math operators as thread-level operations)pbdoc")
.value("TensorOp", cutlass::OpcodeClass::kTensorOp,
R"pbdoc(Tag classifying operators as Tensor Core operations)pbdoc")
.value("WmmaTensorOp", cutlass::OpcodeClass::kWmmaTensorOp,
R"pbdoc(Tag classifying operators as WMMA Tensor Core operations)pbdoc")
.value("SparseTensorOp", cutlass::OpcodeClass::kSparseTensorOp,
R"pbdoc(Tag classifying operators as sparseTensor Core operations)pbdoc");
}

View File

@ -1,102 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Convolution problem sizes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/conv/conv2d_problem_size.h"
namespace py = pybind11;
void bind_conv_problem_size(py::module &m) {
//
// Conv2d Problem Size:
// include/cutlass/conv/conv2d_problem_size.h
//
py::class_<cutlass::conv::Conv2dProblemSize>(m, "Conv2dProblemSize")
// constructors
.def(py::init<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, cutlass::conv::Mode, int, int>())
.def(py::init<cutlass::Tensor4DCoord, cutlass::Tensor4DCoord, cutlass::Tensor4DCoord, cutlass::MatrixCoord, cutlass::MatrixCoord, cutlass::conv::Mode, int, int>())
// attribute accessors
.def_readwrite("N", &cutlass::conv::Conv2dProblemSize::N)
.def_readwrite("H", &cutlass::conv::Conv2dProblemSize::H)
.def_readwrite("W", &cutlass::conv::Conv2dProblemSize::W)
.def_readwrite("C", &cutlass::conv::Conv2dProblemSize::C)
.def_readwrite("P", &cutlass::conv::Conv2dProblemSize::P)
.def_readwrite("Q", &cutlass::conv::Conv2dProblemSize::Q)
.def_readwrite("K", &cutlass::conv::Conv2dProblemSize::K)
.def_readwrite("R", &cutlass::conv::Conv2dProblemSize::R)
.def_readwrite("S", &cutlass::conv::Conv2dProblemSize::S)
.def_readwrite("pad_h", &cutlass::conv::Conv2dProblemSize::pad_h)
.def_readwrite("pad_w", &cutlass::conv::Conv2dProblemSize::pad_w)
.def_readwrite("stride_h", &cutlass::conv::Conv2dProblemSize::stride_h)
.def_readwrite("stride_w", &cutlass::conv::Conv2dProblemSize::stride_w)
.def_readwrite("dilation_h", &cutlass::conv::Conv2dProblemSize::dilation_h)
.def_readwrite("dilation_w", &cutlass::conv::Conv2dProblemSize::dilation_w)
.def_readwrite("mode", &cutlass::conv::Conv2dProblemSize::mode)
.def_readwrite("split_k_slices", &cutlass::conv::Conv2dProblemSize::split_k_slices)
.def_readwrite("groups", &cutlass::conv::Conv2dProblemSize::groups)
// functions
.def("reset_split_k_slices", &cutlass::conv::Conv2dProblemSize::reset_split_k_slices)
.def("activation_extent", &cutlass::conv::Conv2dProblemSize::activation_extent)
.def("filter_extent", &cutlass::conv::Conv2dProblemSize::filter_extent)
.def("output_extent", &cutlass::conv::Conv2dProblemSize::output_extent)
.def("activation_size", &cutlass::conv::Conv2dProblemSize::activation_size)
.def("filter_size", &cutlass::conv::Conv2dProblemSize::filter_size)
.def("output_size", &cutlass::conv::Conv2dProblemSize::output_size);
// Get tensor size
m.def("implicit_gemm_tensor_a_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_a_size));
m.def("implicit_gemm_tensor_b_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_b_size));
m.def("implicit_gemm_tensor_c_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_c_size));
// Get tensor extent
m.def("implicit_gemm_tensor_a_extent",
py::overload_cast<
cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
>(&cutlass::conv::implicit_gemm_tensor_a_extent));
m.def("implicit_gemm_tensor_b_extent",
py::overload_cast<
cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
>(&cutlass::conv::implicit_gemm_tensor_b_extent));
m.def("implicit_gemm_tensor_c_extent",
py::overload_cast<
cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
>(&cutlass::conv::implicit_gemm_tensor_c_extent));
m.def("implicit_gemm_problem_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize &>(&cutlass::conv::implicit_gemm_problem_size));
}

View File

@ -1,91 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution related enum types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "conv_problem_size.h"
#include "host.h"
#include "cutlass/conv/convolution.h"
namespace py = pybind11;
void bind_convolution(py::module &m) {
//
// Enumerate types
// cutlass/include/cutlass/conv/convolution.h
//
/// Convolutional operator
py::enum_<cutlass::conv::Operator>(m, "Operator", R"pbdoc(Convolutional operator)pbdoc")
.value("fprop", cutlass::conv::Operator::kFprop, "Forward propagation")
.value("dgrad", cutlass::conv::Operator::kDgrad, "Activation grad")
.value("wgrad", cutlass::conv::Operator::kWgrad, "Weight grad");
/// Distinguishes convolution from cross correlation
py::enum_<cutlass::conv::Mode>(m, "Mode")
.value("cross_correlation", cutlass::conv::Mode::kCrossCorrelation)
.value("convolution", cutlass::conv::Mode::kConvolution);
/// Selects among several implementation variants trading off performance with simplicity
py::enum_<cutlass::conv::IteratorAlgorithm>(m, "IteratorAlgorithm",
R"pbdoc(Selects among several implementation variants trading off performance with simplicity)pbdoc")
.value("analytic", cutlass::conv::IteratorAlgorithm::kAnalytic, R"pbdoc(functionally correct in all cases but lower performance)pbdoc")
.value("optimized", cutlass::conv::IteratorAlgorithm::kOptimized, R"pbdoc(optimized for R <= 32, S <= 32 and unity-stride dgrad)pbdoc")
.value("fixed_channels", cutlass::conv::IteratorAlgorithm::kFixedChannels, R"pbdoc(Analytic algorithm optimized for fixed channel count (C == AccessSize))pbdoc")
.value("few_channels", cutlass::conv::IteratorAlgorithm::kFewChannels, R"pbdoc(Analytic algorithm optimized for few channels (C divisible by AccessSize))pbdoc");
/// Distinguishes among partial specializations that accelerate certain problems where convolution
/// stride is unit.
py::enum_<cutlass::conv::StrideSupport>(m, "StrideSupport",
R"pbdoc(Distinguishes among partial specializations that accelerate certain problems where convolution
stride is unit.)pbdoc")
.value("strided", cutlass::conv::StrideSupport::kStrided, R"pbdoc(arbitrary convolution stride)pbdoc")
.value("unity", cutlass::conv::StrideSupport::kUnity, R"pbdoc(unit convolution stride)pbdoc");
/// Identifies split-K mode
py::enum_<cutlass::conv::SplitKMode>(m, "SplitKMode")
.value("None", cutlass::conv::SplitKMode::kNone)
.value("Serial", cutlass::conv::SplitKMode::kSerial)
.value("Parallel", cutlass::conv::SplitKMode::kParallel);
// Conv problem sizes
bind_conv_problem_size(m);
//
// host helper functions
//
py::module_ host_submodule = m.def_submodule("host");
bind_conv_host_helper(host_submodule);
}

View File

@ -1,54 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind conv host helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
void bind_conv_host_helper(py::module &m) {
/// reorder operand B for interleaved layout
m.def("reorder_convK", [](
cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>> dest,
cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>> src,
cutlass::conv::Operator conv_op, const cutlass::conv::Conv2dProblemSize & problem_size) {
cutlass::gemm::GemmCoord implicit_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_op, problem_size);
cutlass::reorder_convK<32>(dest, src, implicit_problem_size);
});
}

View File

@ -1,222 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief A generic wrapper around an epilogue visitor operation
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/kernel/default_gemm_complex.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"
#include "epilogue_visitor_op/visitor_op_linear_combination.h"
#include "epilogue_visitor_op/visitor_op_tensor_input.h"
#include "epilogue_visitor_op/visitor_op_accumulator.h"
#include "epilogue_visitor_op/visitor_op_row_broadcast.h"
#include "epilogue_visitor_op/visitor_op_tensor_output.h"
#include "epilogue_visitor_op/visitor_op_column_reduction.h"
#include "epilogue_visitor_op/visitor_op_row_reduction.h"
#include "epilogue_visitor_op/visitor_op_column_broadcast.h"
#include "epilogue_visitor_op/visitor_op_unary.h"
#include "epilogue_visitor_op/visitor_op_binary.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Generic Epilogue Visitor.
template <
typename OutputOp_
>
class EpilogueVisitorGeneric {
public:
using OutputOp = OutputOp_;
using AccumulatorAccessType = typename OutputOp::AccumulatorAccessType;
static int const kElementsPerAccess = OutputOp::kElementsPerAccess;
using ElementOutput = typename OutputOp::ElementOutput;
using OutputTileIterator = typename OutputOp::OutputTileIterator;
static int const kIterations = OutputTileIterator::kIterations;
///
/// End Epilogue Tree
///
/// An additional SMEM buffer is not required in the broadcast epilogue visitor
struct SharedStorage {
typename OutputOp::SharedStorage output_smem;
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
public:
/// Argument structure
struct Arguments {
typename OutputOp::Arguments output_op_args;
//
// Methods
//
Arguments() { }
Arguments(
typename OutputOp::Arguments output_op_args
):
output_op_args(output_op_args)
{
}
};
struct Params {
typename OutputOp::Params output_op_params;
//
// Methods
//
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
output_op_params(args.output_op_args)
{
}
};
private:
OutputOp output_op;
public:
/// Constructor
CUTLASS_DEVICE
EpilogueVisitorGeneric(
Params const &params, ///< Parameters routed to the epilogue
SharedStorage &shared_storage, ///< Shared storage needed by the functors here
MatrixCoord threadblock_offset,
gemm::GemmCoord threadblock_tile_offset,
int thread_idx,
MatrixCoord problem_size
):
output_op(params.output_op_params, shared_storage.output_smem, thread_idx, threadblock_offset, problem_size)
{ }
/// Helper to indicate split-K behavior
CUTLASS_DEVICE
void set_k_partition(
int split_k_index, ///< Index of this threadblock within split-K partitioned scheme
int split_k_slices) { ///< Total number of split-K slices
}
/// Called to set the batch index
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
output_op.set_batch_index(batch_idx);
}
/// Called at the start of the epilogue just before iterating over accumulator slices
CUTLASS_DEVICE
void begin_epilogue() {
output_op.begin_epilogue();
}
/// Called at the start of one step before starting accumulator exchange
CUTLASS_DEVICE
void begin_step(int step_idx) {
output_op.begin_step(step_idx);
}
/// Called at the start of a row
CUTLASS_DEVICE
void begin_row(int row_idx) {
output_op.begin_row(row_idx);
}
/// Called after accumulators have been exchanged for each accumulator vector
CUTLASS_DEVICE
void visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum) {
output_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
}
/// Called at the end of a row
CUTLASS_DEVICE
void end_row(int row_idx) {
output_op.end_row(row_idx);
}
/// Called after all accumulator elements have been visited
CUTLASS_DEVICE
void end_step(int step_idx) {
output_op.end_step(step_idx);
}
/// Called after all steps have been completed
CUTLASS_DEVICE
void end_epilogue() {
output_op.end_epilogue();
}
};
////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////

View File

@ -1,84 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief A file containing the binary ops
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Vector addition
template <typename T, int N>
struct VectorAdd {
struct Arguments {
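// placeholder argument to keep the struct non-empty; mirrors the ctypes placeholder used by the unary ops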
int tmp;
CUTLASS_HOST_DEVICE
Arguments():tmp(0){ }
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { }
};
struct Params {
CUTLASS_HOST_DEVICE
Params(Arguments const &args) { }
};
CUTLASS_HOST_DEVICE
VectorAdd(
Params const &params
) { }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
cutlass::plus<Array<T, N>> add_op;
return add_op(lhs, rhs);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,233 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the unary ops used by epilogue visitors
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/epilogue/thread/activation.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Scalar multiplication
template <typename T, int N>
struct Mult {
struct Arguments {
T alpha;
CUTLASS_HOST_DEVICE
Arguments():alpha(T(1.0)){ }
CUTLASS_HOST_DEVICE
Arguments(T alpha): alpha(alpha) { }
};
struct Params {
T alpha; ///< scales accumulators
CUTLASS_HOST_DEVICE
Params():alpha(T(1.0)){ }
CUTLASS_HOST_DEVICE
Params(Arguments const &args): alpha(args.alpha) { }
};
T alpha_;
CUTLASS_HOST_DEVICE
Mult(
Params const &params
):
alpha_(params.alpha)
{ }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &source) const {
cutlass::multiplies<Array<T, N>> multiply_op;
return multiply_op(source, alpha_);
}
CUTLASS_HOST_DEVICE
bool guard() {
return alpha_ != T(0);
}
};
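// guard() reports whether this node can contribute a nonzero result; a parent
// visitor may use it to skip evaluating this subtree entirely when alpha == 0.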
/// ReLU
template <typename T, int N>
struct ReLUVisitor {
struct Arguments {
T threshold;
CUTLASS_HOST_DEVICE
Arguments():threshold(T(0.0)) { }
CUTLASS_HOST_DEVICE
Arguments(T threshold): threshold(threshold) { }
};
struct Params {
T threshold;
CUTLASS_HOST_DEVICE
Params():threshold(T(0.0)) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args): threshold(args.threshold) { }
};
T threshold_;
CUTLASS_HOST_DEVICE
ReLUVisitor(Params const &params):
threshold_(params.threshold) { }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
maximum<Array<T, N>> mx;
return mx(frag, threshold_);
}
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
/// leakyReLU
template <typename T, int N>
struct LeakyReLUVisitor {
struct Arguments {
T leaky_alpha;
CUTLASS_HOST_DEVICE
Arguments():leaky_alpha(T(0.0)) { }
CUTLASS_HOST_DEVICE
Arguments(T leaky_alpha): leaky_alpha(leaky_alpha) { }
};
struct Params {
T leaky_alpha;
CUTLASS_HOST_DEVICE
Params():leaky_alpha(T(0.0)) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args): leaky_alpha(args.leaky_alpha) { }
};
T leaky_alpha_;
CUTLASS_HOST_DEVICE
LeakyReLUVisitor(Params const &params):
leaky_alpha_(params.leaky_alpha) { }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
cutlass::epilogue::thread::LeakyReLU<Array<T, N>> leaky_op;
return leaky_op(frag, leaky_alpha_);
}
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
/// Tanh
template <typename T, int N>
struct TanhVisitor {
/// Argument
struct Arguments {
// a placeholder argument to ensure correctness of ctypes
int tmp;
CUTLASS_HOST_DEVICE
Arguments(): tmp(0) { };
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { };
};
/// Param
struct Params {
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args) { }
};
/// Constructor
CUTLASS_HOST_DEVICE
TanhVisitor(Params const &params) { }
/// Scalar operator (uses the fast_tanh approximation)
CUTLASS_HOST_DEVICE
T tanh_op(T const &scalar) const {
return fast_tanh(scalar);
}
/// vector operator
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
Array<T, N> y;
CUTLASS_PRAGMA_UNROLL
for (int i=0; i < N; ++i) {
y[i] = tanh_op(frag[i]);
}
return y;
}
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,148 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op that returns the accumulator
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following Computation
///
/// ElementAccumulator accum;
/// return accum;
///
/// It can only be the leaf node of the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
int kElementsPerAccess_ ///< Number of elements computed per operation
>
class VisitorOpAccumulator{
public:
using ElementAccumulator = ElementAccumulator_;
static int const kElementsPerAccess = kElementsPerAccess_;
/// Fragment type for Accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type returned by this visitor
using VisitAccessType = AccumulatorAccessType;
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
// Note: ctypes has issues with empty argument structs, so a placeholder member is kept
int tmp;
CUTLASS_HOST_DEVICE
Arguments(): tmp(0) { }
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { }
};
/// Parameter structure
struct Params {
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args) { }
};
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpAccumulator(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) { }
CUTLASS_DEVICE
void begin_epilogue() { }
CUTLASS_DEVICE
void begin_step(int step_idx) { }
CUTLASS_DEVICE
void begin_row(int row_idx) { }
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
return accum;
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) { }
CUTLASS_DEVICE
void end_epilogue() { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,245 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with a binary op
*/
#pragma once
#include "cutlass/cutlass.h"
#include "binary_ops.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementCompute C = BinaryOp(ElementCompute(Visitor_A), ElementCompute(Visitor_B));
/// return C;
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementCompute_, ///< Data type used to compute linear combination
int kElementsPerAccess_, ///< Number of elements computed per operation
typename VisitorA_, ///< Child node A
typename VisitorB_, ///< Child node B
template<typename T, int N> typename BinaryOp_
>
class VisitorOpBinary{
public:
using ElementAccumulator = ElementAccumulator_;
using ElementCompute = ElementCompute_;
static int const kElementsPerAccess = kElementsPerAccess_;
using VisitorA = VisitorA_;
using VisitorB = VisitorB_;
/// Fragment type returned from VisitorA.visit
using VisitAccessTypeA = typename VisitorA::VisitAccessType;
using ElementA = typename VisitAccessTypeA::Element;
/// Fragment type returned from VisitorB.visit
using VisitAccessTypeB = typename VisitorB::VisitAccessType;
using ElementB = typename VisitAccessTypeB::Element;
/// Fragment type returned by this visitor
using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
using BinaryOp = BinaryOp_<ElementCompute, kElementsPerAccess>;
static_assert(kElementsPerAccess==VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A");
static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B");
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
typename VisitorA::SharedStorage storage_a;
typename VisitorB::SharedStorage storage_b;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
typename BinaryOp::Arguments binary_arg;
typename VisitorA::Arguments visitor_a_arg; ///< Argument type for visitor_a
typename VisitorB::Arguments visitor_b_arg; ///< Argument type for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Arguments():binary_arg() { }
CUTLASS_HOST_DEVICE
Arguments(
typename BinaryOp::Arguments binary_arg,
typename VisitorA::Arguments visitor_a_arg,
typename VisitorB::Arguments visitor_b_arg
):
binary_arg(binary_arg),
visitor_a_arg(visitor_a_arg),
visitor_b_arg(visitor_b_arg)
{ }
};
/// Parameter structure
struct Params {
typename BinaryOp::Params binary_param;
typename VisitorA::Params visitor_a_param; ///< Params for visitor_a
typename VisitorB::Params visitor_b_param; ///< Params for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
binary_param(args.binary_arg),
visitor_a_param(args.visitor_a_arg),
visitor_b_param(args.visitor_b_arg)
{ }
};
private:
//
// Data members
//
BinaryOp binary_op;
VisitorA visitor_a_op;
VisitorB visitor_b_op;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpBinary(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
binary_op(params.binary_param),
visitor_a_op(params.visitor_a_param, shared_storage.storage_a, thread_idx, threadblock_offset, problem_size),
visitor_b_op(params.visitor_b_param, shared_storage.storage_b, thread_idx, threadblock_offset, problem_size)
{ }
CUTLASS_DEVICE
void begin_epilogue() {
visitor_a_op.begin_epilogue();
visitor_b_op.begin_epilogue();
}
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
visitor_a_op.set_batch_index(batch_idx);
visitor_b_op.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
visitor_a_op.begin_step(step_idx);
visitor_b_op.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_a_op.begin_row(row_idx);
visitor_b_op.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor A and visitor B
VisitAccessTypeA result_A = visitor_a_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
VisitAccessTypeB result_B = visitor_b_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
/// Type conversion
NumericArrayConverter<ElementCompute, ElementA, kElementsPerAccess> source_converter_A;
NumericArrayConverter<ElementCompute, ElementB, kElementsPerAccess> source_converter_B;
return binary_op(
source_converter_A(result_A),
source_converter_B(result_B)
);
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_a_op.end_row(row_idx);
visitor_b_op.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_a_op.end_step(step_idx);
visitor_b_op.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_a_op.end_epilogue();
visitor_b_op.end_epilogue();
}
};
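// Composition sketch (illustrative type choices, not a drop-in configuration):
//
//   using Accum   = VisitorOpAccumulator<float, 4>;
//   using Bias    = VisitorOpRowBroadcast<float, float, InputTileIterator>;
//   using AddBias = VisitorOpBinary<float, float, 4, Accum, Bias, cutlass::VectorAdd>;
//
// For each fragment, AddBias::visit converts both child results to
// ElementCompute and returns VectorAdd(accum, bias).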
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,250 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op that broadcasts a vector to all columns
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementVector T[i][j] <- device-memory Td[i]
///
/// It can only be a leaf node in the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementFragment_, ///< Data type used to cache vector in register
typename InputTileIterator_ ///< Tile iterator type to read the broadcasted tensor
>
class VisitorOpColumnBroadcast {
public:
using InputTileIterator = InputTileIterator_;
static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
using ElementAccumulator = ElementAccumulator_;
using ElementVector = typename InputTileIterator::Element;
using ElementFragment = ElementFragment_;
using VisitAccessType = Array<ElementFragment, kElementsPerAccess>;
/// Thread map used by input tile iterators
using ThreadMap = typename InputTileIterator::ThreadMap;
/// Fragment object used to store the broadcast values
using BroadcastFragment = Array<
ElementFragment, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Used for the broadcast
struct BroadcastDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = (InputTileIterator::Shape::kN / kColumnsPerThread);
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
static int const kThreadRows = kThreadCount / kThreadsPerRow;
// /// Number of iterations (accesses) the threadblock takes to reduce a row
// static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
};
// using ComputeFragmentType = Array<ElementVector, BroadcastDetail::kElementsPerAccess>;
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
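// SharedStorage is intentionally empty: visit() re-reads the broadcast scalar
// for its row directly from global memory, so nothing is staged through SMEM.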
/// Host-constructable Argument structure
struct Arguments {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Methods
CUTLASS_HOST_DEVICE
Arguments():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementVector *broadcast_ptr,
int64_t batch_stride
):
broadcast_ptr(broadcast_ptr),
batch_stride(batch_stride) { }
};
/// Param structure
struct Params {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Method
CUTLASS_HOST_DEVICE
Params():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
broadcast_ptr(args.broadcast_ptr),
batch_stride(args.batch_stride) { }
};
private:
ElementVector *broadcast_ptr;
BroadcastFragment broadcast_fragment; ///< Array holds the loaded broadcast fragment
MatrixCoord threadblock_offset_;
int thread_idx_;
MatrixCoord problem_size;
int thread_start_row_;
int state_[3];
int thread_offset_row_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpColumnBroadcast(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
broadcast_ptr(params.broadcast_ptr),
threadblock_offset_(threadblock_offset),
thread_idx_(thread_idx),
problem_size(problem_size),
thread_start_row_(ThreadMap::initial_offset(thread_idx).row() + threadblock_offset.row()),
batch_stride_(params.batch_stride)
{
state_[0] = state_[1] = state_[2] = 0;
}
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
broadcast_ptr += batch_idx * batch_stride_;
}
CUTLASS_DEVICE
void begin_epilogue() { }
CUTLASS_DEVICE
void begin_step(int step_idx) {}
CUTLASS_DEVICE
void begin_row(int row_idx) {}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
// Locate the row this fragment maps to and broadcast its scalar across the fragment
thread_offset_row_ = thread_start_row_ + ThreadMap::iteration_offset(frag_idx).row();
ElementFragment broadcast_data = ElementFragment(*(broadcast_ptr + thread_offset_row_));
broadcast_fragment.fill(broadcast_data);
return broadcast_fragment;
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) {
// Advance the per-thread row offset through the ThreadMap hierarchy:
// rows within a group, then groups, then clusters
++state_[0];
thread_start_row_ += ThreadMap::Shape::kRow;
if (state_[0] == ThreadMap::Count::kRow) {
state_[0] = 0;
++state_[1];
thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
if (state_[1] == ThreadMap::Count::kGroup) {
state_[1] = 0;
++state_[2];
thread_start_row_ += ThreadMap::Count::kGroup *
ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
if (state_[2] == ThreadMap::Count::kCluster) {
state_[2] = 0;
}
}
}
}
CUTLASS_DEVICE
void end_epilogue() { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,341 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with reduction over columns in a CTA
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementReductionAccumulator R[j] = \sum_i ElementReductionAccumulator(T[i][j])
/// device memory <- ElementReduction(R[j])
///
template <
typename ThreadblockShape_, /// Threadblock shape
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementReduction_, ///< Data type of the output reduction in device memory
typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register
typename OutputTileIterator_, ///< Tile Iterator type
typename Visitor_ ///< preceding visitor op
>
class VisitorOpColumnReduction {
public:
using ElementAccumulator = ElementAccumulator_;
using ElementReductionAccumulator = ElementReductionAccumulator_;
using ElementReduction = ElementReduction_;
using OutputTileIterator = OutputTileIterator_;
using ThreadblockShape = ThreadblockShape_;
using Visitor = Visitor_;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
using ReductionOp = cutlass::plus<Array<ElementReductionAccumulator, kElementsPerAccess>>;
using ReductionOpScalar = cutlass::plus<ElementReductionAccumulator>;
using ElementOutput = typename OutputTileIterator::Element;
/// Fragment type returned from Visitor
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
using VisitAccessType = VisitAccessTypeVisitor;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type of reduction
using ReductionAccumulatorAccessType = Array<ElementReductionAccumulator, kElementsPerAccess>;
/// Thread map used by output tile iterators
using ThreadMap = typename OutputTileIterator::ThreadMap;
/// Used for the reduction
struct ReductionDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = ThreadblockShape::kN / kColumnsPerThread;
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
static int const kThreadRows = kThreadCount / kThreadsPerRow;
/// Number of iterations (accesses) the threadblock takes to reduce a row
static int const kThreadAccessesPerRow = const_max(1, (ThreadblockShape::kN + kThreadCount - 1) / kThreadCount);
using StorageShape = MatrixShape<
kThreadRows,
ThreadblockShape::kN
>;
};
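// The reduction proceeds in two phases: visit() accumulates per-thread partial
// sums into the ReductionFragment registers, then end_epilogue() stages those
// partials through the shared-memory StorageShape tile and reduces across
// thread rows before writing converted results to global memory.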
using ReductionFragment = Array<ElementReductionAccumulator, ReductionDetail::kColumnsPerThread>;
/// Shared storage
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
AlignedArray<ElementReductionAccumulator, ReductionDetail::StorageShape::kCount, 16> reduction;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Argument structure
struct Arguments {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Arguments visitor_arg; ///< Argument type of visitor
/// Method
CUTLASS_HOST_DEVICE
Arguments(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementReduction *reduction_ptr,
int64_t batch_stride,
typename Visitor::Arguments visitor_arg
):
reduction_ptr(reduction_ptr),
batch_stride(batch_stride),
visitor_arg(visitor_arg)
{ }
};
/// Param structure
struct Params {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Params visitor_param; ///< Params of visitor
/// Method
CUTLASS_HOST_DEVICE
Params(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
reduction_ptr(args.reduction_ptr),
batch_stride(args.batch_stride),
visitor_param(args.visitor_arg)
{ }
};
private:
ElementReduction *reduction_output_ptr_; ///< Pointer to the reduction tensor in device memory
ElementReductionAccumulator *reduction_smem_ptr_; ///< Pointer to the partial reductions in shared memory
ReductionFragment reduction_fragment; ///< register fragments that hold the partial reduction
Visitor visitor_; ///< visitor
int thread_idx_;
MatrixCoord threadblock_offset;
MatrixCoord problem_size_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpColumnReduction(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
visitor_(params.visitor_param, shared_storage.storage_visitor,
thread_idx, threadblock_offset, problem_size),
reduction_smem_ptr_(shared_storage.reduction.data()),
reduction_output_ptr_(params.reduction_ptr),
thread_idx_(thread_idx),
threadblock_offset(threadblock_offset),
problem_size_(problem_size),
batch_stride_(params.batch_stride)
{ }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
reduction_output_ptr_ += batch_idx * batch_stride_;
visitor_.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
visitor_.begin_epilogue();
// clear the reduction fragment
reduction_fragment.clear();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
visitor_.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor
VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
NumericArrayConverter<ElementReductionAccumulator, ElementVisitor, kElementsPerAccess> reduction_converter;
ReductionOp reduction_op;
ReductionAccumulatorAccessType* reduction_fragment_ = reinterpret_cast<ReductionAccumulatorAccessType*>(&reduction_fragment);
reduction_fragment_[column_idx] = reduction_op(reduction_fragment_[column_idx], reduction_converter(result));
return result;
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_.end_epilogue();
//
// Store the partially reduced value to SMEM
//
// Guard against uses of the existing SMEM tile
__syncthreads();
using AccessType = AlignedArray<ElementReductionAccumulator, ThreadMap::kElementsPerAccess>;
//
// Determine a compact thread arrangement to store to SMEM
//
MatrixCoord thread_offset(
thread_idx_ / ReductionDetail::kThreadsPerRow,
(thread_idx_ % ReductionDetail::kThreadsPerRow) * ThreadMap::kElementsPerAccess
);
//
// Each thread store its fragment to a SMEM
//
AccessType *aligned_reduction_ptr = reinterpret_cast<AccessType *>(
&reduction_smem_ptr_[thread_offset.row() * ThreadblockShape::kN + thread_offset.column()]
);
AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(
&reduction_fragment
);
CUTLASS_PRAGMA_UNROLL
for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
int col_idx = column * ThreadMap::Delta::kColumn / ThreadMap::kElementsPerAccess;
aligned_reduction_ptr[col_idx] = frag_ptr[column];
}
__syncthreads();
//
// Now, threads are assigned several columns of the output. They fetch all rows from
// the compacted SMEM tile and perform a reduction.
//
NumericConverter<ElementReduction, ElementReductionAccumulator> output_converter;
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < ReductionDetail::kThreadAccessesPerRow; ++j) {
int column_idx = thread_idx_ + j * ReductionDetail::kThreadCount;
ReductionOpScalar reduction_op;
ElementReductionAccumulator reduction_element = ElementReductionAccumulator();
int output_column_idx = threadblock_offset.column() + column_idx;
if (column_idx < ThreadblockShape::kN && output_column_idx < problem_size_.column()) {
CUTLASS_PRAGMA_UNROLL
for (int row = 0; row < ReductionDetail::kThreadRows; ++row) {
if (row) {
auto frag = reduction_smem_ptr_[row * ThreadblockShape::kN + column_idx];
reduction_element = reduction_op(reduction_element, frag);
}
else {
reduction_element = reduction_smem_ptr_[column_idx];
}
}
// Store the partial reduction for this threadblock row
int output_offset = output_column_idx + threadblock_offset.row() / ThreadblockShape::kM * problem_size_.column();
reduction_output_ptr_[output_offset] = output_converter(reduction_element);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,266 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with linear combination
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementCompute alpha;
/// ElementCompute beta;
/// ElementCompute C = alpha * ElementCompute(Visitor_A) + beta * ElementCompute(Visitor_B);
/// return C;
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementCompute_, ///< Data type used to compute linear combination
int kElementsPerAccess_, ///< Number of elements computed per operation
typename VisitorA_, ///< Child node A
typename VisitorB_ ///< Child node B
>
class VisitorOpLinearCombination{
public:
using ElementAccumulator = ElementAccumulator_;
using ElementCompute = ElementCompute_;
static int const kElementsPerAccess = kElementsPerAccess_;
using VisitorA = VisitorA_;
using VisitorB = VisitorB_;
/// Fragment type returned from VisitorA.visit
using VisitAccessTypeA = typename VisitorA::VisitAccessType;
using ElementA = typename VisitAccessTypeA::Element;
/// Fragment type returned from VisitorB.visit
using VisitAccessTypeB = typename VisitorB::VisitAccessType;
using ElementB = typename VisitAccessTypeB::Element;
/// Fragment type returned by this visitor
using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Combination Op
using CombinationOp = cutlass::plus<VisitAccessType>;
static_assert(kElementsPerAccess==VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A");
static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B");
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
typename VisitorA::SharedStorage storage_a;
typename VisitorB::SharedStorage storage_b;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
ElementCompute alpha; ///< scales accumulators
ElementCompute beta; ///< scales source tensor
typename VisitorA::Arguments visitor_a_arg; ///< Argument type for visitor_a
typename VisitorB::Arguments visitor_b_arg; ///< Argument type for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Arguments():
alpha(ElementCompute(1)),
beta(ElementCompute(0))
{ }
CUTLASS_HOST_DEVICE
Arguments(
ElementCompute alpha,
ElementCompute beta,
typename VisitorA::Arguments visitor_a_arg,
typename VisitorB::Arguments visitor_b_arg
):
alpha(alpha),
beta(beta),
visitor_a_arg(visitor_a_arg),
visitor_b_arg(visitor_b_arg)
{ }
};
/// Parameter structure
struct Params {
ElementCompute alpha; ///< scales accumulators
ElementCompute beta; ///< scales source tensor
typename VisitorA::Params visitor_a_param; ///< Params for visitor_a
typename VisitorB::Params visitor_b_param; ///< Params for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
alpha(args.alpha),
beta(args.beta),
visitor_a_param(args.visitor_a_arg),
visitor_b_param(args.visitor_b_arg)
{ }
};
private:
//
// Data members
//
ElementCompute alpha_;
ElementCompute beta_;
VisitorA visitor_a_op;
VisitorB visitor_b_op;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpLinearCombination(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
alpha_(params.alpha),
beta_(params.beta),
visitor_a_op(params.visitor_a_param, shared_storage.storage_a, thread_idx, threadblock_offset, problem_size),
visitor_b_op(params.visitor_b_param, shared_storage.storage_b, thread_idx, threadblock_offset, problem_size)
{ }
CUTLASS_DEVICE
void begin_epilogue() {
if (alpha_ != ElementCompute(0)) visitor_a_op.begin_epilogue();
if (beta_ != ElementCompute(0)) visitor_b_op.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.begin_step(step_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.begin_row(row_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor A and visitor B
VisitAccessTypeA result_A;
VisitAccessTypeB result_B;
if (alpha_ != ElementCompute(0)) {
result_A = visitor_a_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
} else {
// Fill the result A with zeros
result_A.clear();
}
if (beta_ != ElementCompute(0)) {
result_B = visitor_b_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
} else {
// Fill the result B with zeros
result_B.clear();
}
/// Type conversion
NumericArrayConverter<ElementCompute, ElementA, kElementsPerAccess> source_converter_A;
NumericArrayConverter<ElementCompute, ElementB, kElementsPerAccess> source_converter_B;
CombinationOp combination_op;
cutlass::multiplies<VisitAccessType> multiply_op;
return combination_op(
multiply_op(alpha_, source_converter_A(result_A)),
multiply_op(beta_, source_converter_B(result_B))
);
}
CUTLASS_DEVICE
void end_row(int row_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.end_row(row_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.end_step(step_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
if (alpha_ != ElementCompute(0)) visitor_a_op.end_epilogue();
if (beta_ != ElementCompute(0)) visitor_b_op.end_epilogue();
}
};
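// Composition sketch for a classic linear combination (illustrative types):
//
//   using Accum   = VisitorOpAccumulator<float, 4>;
//   using Source  = VisitorOpColumnBroadcast<float, float, InputTileIterator>;
//   using LinComb = VisitorOpLinearCombination<float, float, 4, Accum, Source>;
//
// Each visit() returns alpha * accum + beta * source; a subtree whose scalar
// is zero is never evaluated.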
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,258 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op that broadcasts a vector to all rows
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementVector T[i][j] <- device-memory Td[j]
///
/// It can only be a leaf node in the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementFragment_, ///< Data type used to cache vector in register
typename InputTileIterator_ ///< Tile iterator type to read the broadcasted tensor
>
class VisitorOpRowBroadcast {
public:
using InputTileIterator = InputTileIterator_;
static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
using ElementAccumulator = ElementAccumulator_;
using ElementVector = typename InputTileIterator::Element;
using ElementFragment = ElementFragment_;
using VisitAccessType = Array<ElementFragment, kElementsPerAccess>;
/// Thread map used by input tile iterators
using ThreadMap = typename InputTileIterator::ThreadMap;
/// Fragment object used to store the broadcast values
using BroadcastFragment = Array<
ElementFragment,
ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Used for the broadcast
struct BroadcastDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = (InputTileIterator::Shape::kN / kColumnsPerThread);
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
static int const kThreadRows = kThreadCount / kThreadsPerRow;
// /// Number of iterations (accesses) the threadblock takes to reduce a row
// static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
};
// using ComputeFragmentType = Array<ElementVector, BroadcastDetail::kElementsPerAccess>;
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Methods
CUTLASS_HOST_DEVICE
Arguments():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementVector *broadcast_ptr,
int64_t batch_stride
):
broadcast_ptr(broadcast_ptr),
batch_stride(batch_stride) { }
};
/// Param structure
struct Params {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Method
CUTLASS_HOST_DEVICE
Params():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
broadcast_ptr(args.broadcast_ptr),
batch_stride(args.batch_stride) { }
};
private:
ElementVector *broadcast_ptr;
BroadcastFragment broadcast_fragment; ///< Array holds the loaded broadcast fragment
MatrixCoord threadblock_offset_;
int thread_idx_;
MatrixCoord problem_size;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpRowBroadcast(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
broadcast_ptr(params.broadcast_ptr + threadblock_offset.column()),
threadblock_offset_(threadblock_offset),
thread_idx_(thread_idx),
problem_size(problem_size),
batch_stride_(params.batch_stride) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
broadcast_ptr += batch_idx * batch_stride_;
}
CUTLASS_DEVICE
void begin_epilogue() {
// load broadcast fragment
load_broadcast_fragment_();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {}
CUTLASS_DEVICE
void begin_row(int row_idx) {}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
VisitAccessType* broadcast_fragment_ = reinterpret_cast<VisitAccessType*>(&broadcast_fragment);
return broadcast_fragment_[column_idx];
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) { }
CUTLASS_DEVICE
void end_epilogue() { }
private:
CUTLASS_DEVICE
void load_broadcast_fragment_() {
broadcast_fragment.clear();
// If no pointer is supplied, leave the fragment zeroed and avoid memory accesses
if (!broadcast_ptr) {
return;
}
int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
int thread_column_idx = threadblock_offset_.column() + thread_initial_column;
broadcast_ptr += thread_initial_column;
NumericArrayConverter<ElementFragment, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
using AccessFragmentType = Array<ElementFragment, BroadcastDetail::kElementsPerAccess>;
AccessFragmentType *frag_ptr = reinterpret_cast<AccessFragmentType *>(&broadcast_fragment);
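// Each loop iteration issues one aligned vector load, predicated on the
// column index being in bounds, then converts it to the fragment element type.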
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
AccessType loaded;
loaded.clear();
if (thread_column_idx < problem_size.column()) {
loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
}
AccessFragmentType cvt = converter(loaded);
frag_ptr[j] = cvt;
thread_column_idx += ThreadMap::Delta::kColumn;
broadcast_ptr += ThreadMap::Delta::kColumn;
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,319 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with reduction over rows in a CTA
*/
#pragma once
#include "cutlass/cutlass.h"
#include "stdio.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementReductionAccumulator R[i] = \sum_j ElementReductionAccumulator(T[i][j])
/// device memory <- ElementReduction(R[i])
///
template <
typename ThreadblockShape_, /// Threadblock shape
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementReduction_, ///< Data type of the output reduction in device memory
typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register
typename OutputTileIterator_, ///< Tile Iterator type
typename Visitor_ ///< preceding visitor op
>
class VisitorOpRowReduction {
public:
using ElementAccumulator = ElementAccumulator_;
using ElementReductionAccumulator = ElementReductionAccumulator_;
using ElementReduction = ElementReduction_;
using OutputTileIterator = OutputTileIterator_;
using ThreadblockShape = ThreadblockShape_;
using Visitor = Visitor_;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
using ReductionOp = cutlass::plus<Array<ElementReductionAccumulator, kElementsPerAccess>>;
using ReductionOpScalar = cutlass::plus<ElementReductionAccumulator>;
using ElementOutput = typename OutputTileIterator::Element;
/// Fragment type returned from Visitor
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
using VisitAccessType = VisitAccessTypeVisitor;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type of reduction
using ReductionAccumulatorAccessType = Array<ElementReductionAccumulator, kElementsPerAccess>;
/// Thread map used by output tile iterators
using ThreadMap = typename OutputTileIterator::ThreadMap;
/// Used for the reduction
struct ReductionDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = ThreadblockShape::kN / kColumnsPerThread;
/// Half number of threads per row used for cross-thread reduction
static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1);
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
static int const kThreadRows = kThreadCount / kThreadsPerRow;
};
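// Unlike the column reduction, the row reduction keeps no SMEM tile: threads
// covering the same row exchange partial sums via warp shuffles in visit(),
// which assumes those threads map into a single warp.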
/// Shared storage
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Arguments visitor_arg; ///< Argument type of visitor
/// Method
CUTLASS_HOST_DEVICE
Arguments(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementReduction *reduction_ptr,
int64_t batch_stride,
typename Visitor::Arguments visitor_arg
):
reduction_ptr(reduction_ptr),
batch_stride(batch_stride),
visitor_arg(visitor_arg)
{ }
};
/// Param structure
struct Params {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Params visitor_param; ///< Params of visitor
/// Method
CUTLASS_HOST_DEVICE
Params(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
reduction_ptr(args.reduction_ptr),
batch_stride(args.batch_stride),
visitor_param(args.visitor_arg)
{ }
};
private:
ElementReduction *reduction_output_ptr_; ///< Pointer to the reduction tensor in device memory
ElementReductionAccumulator reduction_accum;
Visitor visitor_; ///< visitor
int thread_idx_;
MatrixCoord threadblock_offset;
MatrixCoord problem_size_;
int thread_start_row_; ///< Starting row index of this thread within the output tile
int state_[3]; ///< Tracks the row iterator state across steps
int thread_offset_row_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpRowReduction(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
visitor_(params.visitor_param, shared_storage.storage_visitor,
thread_idx, threadblock_offset, problem_size),
reduction_output_ptr_(params.reduction_ptr),
thread_idx_(thread_idx),
threadblock_offset(threadblock_offset),
problem_size_(problem_size),
thread_start_row_(ThreadMap::initial_offset(thread_idx).row() + threadblock_offset.row()),
batch_stride_(params.batch_stride)
{
state_[0] = state_[1] = state_[2] = 0;
}
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
reduction_output_ptr_ += batch_idx * batch_stride_;
visitor_.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
visitor_.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
visitor_.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_.begin_row(row_idx);
reduction_accum = ElementReductionAccumulator(0);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor
VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
thread_offset_row_ = thread_start_row_ + ThreadMap::iteration_offset(frag_idx).row();
ReductionOpScalar reduction_op;
ElementReductionAccumulator reduction_accum_ = reduction(result);
// After performing the in-thread reduction, we then perform cross-thread / in-warp reduction
CUTLASS_PRAGMA_UNROLL
for (int i = ReductionDetail::kHalfThreadsPerRow; i > 0; i >>= 1) {
reduction_accum_ = reduction_op(reduction_accum_, __shfl_xor_sync(0xFFFFFFFF, reduction_accum_, i));
}
reduction_accum = reduction_op(reduction_accum, reduction_accum_);
return result;
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_.end_row(row_idx);
NumericConverter<ElementReduction, ElementReductionAccumulator> output_converter;
bool is_write_thread = (thread_offset_row_ < problem_size_.row() && (thread_idx_ % ReductionDetail::kThreadsPerRow) == 0);
int row_offset = thread_offset_row_ + threadblock_offset.column() / ThreadblockShape::kN * problem_size_.row();
ElementReduction *curr_ptr_reduction = reduction_output_ptr_ + row_offset;
arch::global_store<ElementReduction, sizeof(ElementReduction)>(
output_converter(reduction_accum),
(void *)curr_ptr_reduction,
is_write_thread);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_.end_step(step_idx);
// Advance the thread's starting row, mirroring the output tile iterator's operator++
++state_[0];
thread_start_row_ += ThreadMap::Shape::kRow;
if (state_[0] == ThreadMap::Count::kRow) {
state_[0] = 0;
++state_[1];
thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
if (state_[1] == ThreadMap::Count::kGroup) {
state_[1] = 0;
++state_[2];
thread_start_row_ += ThreadMap::Count::kGroup *
ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
if (state_[2] == ThreadMap::Count::kCluster) {
state_[2] = 0;
}
}
}
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_.end_epilogue();
}
private:
CUTLASS_DEVICE
ElementReductionAccumulator reduction(VisitAccessTypeVisitor const& result) {
ElementReductionAccumulator sum_ = ElementReductionAccumulator(0);
ReductionOpScalar reduction_op;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < VisitAccessTypeVisitor::kElements; ++i) {
sum_ = reduction_op(sum_, result[i]);
}
return sum_;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
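The cross-thread step in VisitorOpRowReduction::visit() above is the classic XOR-shuffle butterfly reduction: each exchange halves the number of distinct partial sums, so after log2(N) steps every participating lane holds the group total, and end_row() can then let a single lane per row perform the predicated global store. A minimal standalone CUDA sketch of the same pattern follows; it reduces across a full warp rather than kHalfThreadsPerRow pairs, and the kernel and buffer names are illustrative, not CUTLASS symbols.

#include <cuda_runtime.h>
#include <cstdio>

// Butterfly reduction across one warp: after 5 XOR-shuffle steps every lane
// holds the warp-wide sum; only lane 0 writes, as in end_row() above.
__global__ void warp_sum(float const *in, float *out) {
  float v = in[threadIdx.x];
  for (int i = 16; i > 0; i >>= 1) {
    v += __shfl_xor_sync(0xFFFFFFFF, v, i);
  }
  if (threadIdx.x == 0) {
    *out = v;
  }
}

int main() {
  float h_in[32], h_out, *d_in, *d_out;
  for (int i = 0; i < 32; ++i) h_in[i] = float(i);   // expected sum: 496
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warp_sum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("%f\n", h_out);                        // 496.000000
  return 0;
}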

View File

@ -1,188 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with Tensor Input
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementInput C <- device memory
///
/// It can only be a leaf node in the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename InputTileIterator_ ///< Tile iterator type to read the tensor
>
class VisitorOpTensorInput {
public:
using ElementAccumulator = ElementAccumulator_;
using InputTileIterator = InputTileIterator_;
static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
using ElementInput = typename InputTileIterator::Element;
using VisitAccessType = Array<ElementInput, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementInput *input_ptr; ///< Pointer to the input tensor in device memory
int ldt; ///< Leading dimension of the input tensor operand
int64_t batch_stride; ///< batch stride for batched GEMM
/// Methods
CUTLASS_HOST_DEVICE
Arguments(): input_ptr(nullptr), ldt(0), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementInput *input_ptr,
int ldt, int64_t batch_stride
):
input_ptr(input_ptr),
ldt(ldt),
batch_stride(batch_stride)
{ }
};
/// Param structure
struct Params {
typename InputTileIterator::Params params_input;
ElementInput *input_ptr;
int64_t batch_stride;
/// Method
CUTLASS_HOST_DEVICE
Params():
input_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
params_input(args.ldt),
input_ptr(args.input_ptr),
batch_stride(args.batch_stride)
{ }
};
private:
InputTileIterator iterator_T_;
typename InputTileIterator::Fragment fragment_T_;
MatrixCoord problem_size;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpTensorInput(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
iterator_T_(
InputTileIterator(
params.params_input,
params.input_ptr,
problem_size,
thread_idx,
threadblock_offset
)
),
problem_size(problem_size),
batch_stride_(params.batch_stride) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
iterator_T_.add_pointer_offset(batch_idx * batch_stride_);
}
CUTLASS_DEVICE
void begin_epilogue() { }
CUTLASS_DEVICE
void begin_step(int step_idx) {
fragment_T_.clear();
iterator_T_.load(fragment_T_);
++iterator_T_;
}
CUTLASS_DEVICE
void begin_row(int row_idx) { }
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
VisitAccessType source = reinterpret_cast<VisitAccessType *>(&fragment_T_)[frag_idx];
return source;
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) { }
CUTLASS_DEVICE
void end_epilogue() { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,240 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with Tensor Output
*/
#pragma once
#include "cutlass/cutlass.h"
#include "stdio.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementOutput T = ElementOutput(Visitor)
/// T -> device memory
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename OutputTileIterator_, ///< Tile iterator type to write the tensor
typename Visitor_ ///< Child visitor that produces the output tensor
>
class VisitorOpTensorOutput {
public:
using ElementAccumulator = ElementAccumulator_;
using OutputTileIterator = OutputTileIterator_;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
using ElementOutput = typename OutputTileIterator::Element;
using Visitor = Visitor_;
/// Fragment type returned from Visitor
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
using VisitAccessType = VisitAccessTypeVisitor;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type of output
using OutputAccessType = Array<ElementOutput, kElementsPerAccess>;
static_assert(kElementsPerAccess==VisitAccessTypeVisitor::kElements, "kElementsPerAccess mismatches with Visitor");
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementOutput *output_ptr; ///< Pointer to the output tensor in device memory
int ldt; ///< Leading dimension of the output tensor operand
int64_t batch_stride; ///< batch stride
typename Visitor::Arguments visitor_arg; ///< Argument type of visitor
/// Methods
CUTLASS_HOST_DEVICE
Arguments(): output_ptr(nullptr), ldt(0), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementOutput *output_ptr,
int ldt,
int64_t batch_stride,
typename Visitor::Arguments visitor_arg
):
output_ptr(output_ptr),
ldt(ldt),
batch_stride(batch_stride),
visitor_arg(visitor_arg)
{ }
};
/// Param structure
struct Params {
typename OutputTileIterator::Params params_output;
ElementOutput *output_ptr;
int64_t batch_stride;
typename Visitor::Params visitor_param;
/// Method
CUTLASS_HOST_DEVICE
Params():
output_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
params_output(args.ldt),
output_ptr(args.output_ptr),
batch_stride(args.batch_stride),
visitor_param(args.visitor_arg)
{ }
};
private:
OutputTileIterator iterator_T_;
typename OutputTileIterator::Fragment fragment_T_;
MatrixCoord problem_size;
Visitor visitor_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpTensorOutput(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
visitor_(params.visitor_param, shared_storage.storage_visitor, thread_idx, threadblock_offset, problem_size),
iterator_T_(
OutputTileIterator(
params.params_output,
params.output_ptr,
problem_size,
thread_idx,
threadblock_offset
)
),
problem_size(problem_size),
batch_stride_(params.batch_stride) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
iterator_T_.add_pointer_offset(batch_idx * batch_stride_);
visitor_.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
visitor_.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
fragment_T_.clear();
visitor_.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor
VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
// Column guard
MatrixCoord thread_offset_ = iterator_T_.thread_start() + OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
bool column_guard = (thread_offset_.column() < problem_size.column());
if (column_guard) {
NumericArrayConverter<ElementOutput, ElementVisitor, kElementsPerAccess> output_converter;
OutputAccessType &output = reinterpret_cast<OutputAccessType *>(&fragment_T_)[frag_idx];
output = output_converter(result);
}
return result;
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_.end_step(step_idx);
iterator_T_.store(fragment_T_);
++iterator_T_;
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_.end_epilogue();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
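VisitorOpTensorInput above is a leaf of the epilogue visitor tree, while VisitorOpTensorOutput is an interior node that forwards every begin/visit/end hook to its child and post-processes the fragment the child returns. The following is a much-simplified, CUTLASS-free C++ sketch of that protocol, with illustrative names and std::array standing in for cutlass::Array; it shows only the shape of the pattern, not the real iterator machinery.

#include <array>
#include <cstdio>

struct LeafAccum {                        // leaf node: returns the accumulator fragment
  using Fragment = std::array<float, 4>;
  void begin_step(int) { }
  Fragment visit(Fragment const &accum) { return accum; }
  void end_step(int) { }
};

template <typename Child>
struct ScaleNode {                        // interior node: transforms the child's result
  using Fragment = typename Child::Fragment;
  Child child;
  float alpha;
  void begin_step(int step_idx) { child.begin_step(step_idx); }
  Fragment visit(Fragment const &accum) {
    Fragment result = child.visit(accum); // evaluate the subtree first
    for (auto &x : result) x *= alpha;    // then apply this node's transform
    return result;
  }
  void end_step(int step_idx) { child.end_step(step_idx); }
};

int main() {
  ScaleNode<LeafAccum> tree{LeafAccum{}, 2.0f};
  tree.begin_step(0);
  LeafAccum::Fragment out = tree.visit({1.0f, 2.0f, 3.0f, 4.0f});
  tree.end_step(0);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 4 6 8
  return 0;
}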

View File

@ -1,226 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with a Unary operation
*/
#pragma once
#include "cutlass/cutlass.h"
#include "unary_ops.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementCompute C = UnaryOp(ElementCompute(Visitor))
/// Return C;
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementCompute_, ///< Data type used to compute linear combination
int kElementsPerAccess_, ///< Number of elements computed per operation
typename Visitor_, ///< Child node
template<typename T, int N> typename UnaryOp_
>
class VisitorOpUnary{
public:
using ElementAccumulator = ElementAccumulator_;
using ElementCompute = ElementCompute_;
static int const kElementsPerAccess = kElementsPerAccess_;
using Visitor = Visitor_;
/// Fragment type returned from Visitor.visit
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisit = typename VisitAccessTypeVisitor::Element;
/// Fragment type returned by this visitor
using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Combination Op
using UnaryOp = UnaryOp_<ElementCompute, kElementsPerAccess>;
static_assert(kElementsPerAccess==VisitAccessTypeVisitor::kElements, "kElementsPerAccess mismatches with Visitor");
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
typename UnaryOp::Arguments unary_arg;
typename Visitor::Arguments visitor_arg; ///< Argument type for visitor
//
// Methods
//
CUTLASS_HOST_DEVICE
Arguments():unary_arg() { }
CUTLASS_HOST_DEVICE
Arguments(
typename UnaryOp::Arguments unary_arg,
typename Visitor::Arguments visitor_arg
):
unary_arg(unary_arg),
visitor_arg(visitor_arg)
{ }
};
/// Parameter structure
struct Params {
typename UnaryOp::Params unary_param;
typename Visitor::Params visitor_param; ///< Argument type for visitor
//
// Methods
//
CUTLASS_HOST_DEVICE
Params():unary_param() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
unary_param(args.unary_arg),
visitor_param(args.visitor_arg)
{ }
};
private:
//
// Data members
//
UnaryOp unary_op;
Visitor visitor_op;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpUnary(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
unary_op(params.unary_param),
visitor_op(params.visitor_param, shared_storage.storage_visitor, thread_idx, threadblock_offset, problem_size)
{ }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
visitor_op.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
if (unary_op.guard()) visitor_op.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
if (unary_op.guard()) visitor_op.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
if (unary_op.guard()) visitor_op.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from the child visitor
VisitAccessTypeVisitor result;
if (unary_op.guard()) {
result = visitor_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
} else {
result.clear();
}
/// Convert to the compute type and apply the unary op
NumericArrayConverter<ElementCompute, ElementVisit, kElementsPerAccess> source_converter;
return unary_op(source_converter(result));
}
CUTLASS_DEVICE
void end_row(int row_idx) {
if (unary_op.guard()) visitor_op.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
if (unary_op.guard()) visitor_op.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
if (unary_op.guard()) visitor_op.end_epilogue();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
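VisitorOpUnary imposes an implicit interface on its UnaryOp_ template template parameter: nested Arguments and Params types, a guard() predicate that can short-circuit the whole subtree, and a call operator applied to a fragment. A hedged sketch of a conforming op, inferred from the usage above, with std::array standing in for cutlass::Array:

#include <array>

template <typename T, int N>
struct ScaleOp {
  struct Arguments {
    T alpha = T(1);
  };
  struct Params {
    T alpha;
    Params(): alpha(T(1)) { }
    Params(Arguments const &args): alpha(args.alpha) { }
  };
  T alpha;
  ScaleOp(Params const &params): alpha(params.alpha) { }
  // When guard() returns false, VisitorOpUnary skips the child subtree entirely.
  bool guard() const { return alpha != T(0); }
  std::array<T, N> operator()(std::array<T, N> const &frag) const {
    std::array<T, N> result;
    for (int i = 0; i < N; ++i) {
      result[i] = alpha * frag[i];
    }
    return result;
  }
};

int main() {
  ScaleOp<float, 4>::Params params(ScaleOp<float, 4>::Arguments{2.0f});
  ScaleOp<float, 4> op(params);
  std::array<float, 4> out = op({1.0f, 2.0f, 3.0f, 4.0f});
  return out[3] == 8.0f ? 0 : 1;
}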

View File

@ -1,480 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Epilogue visitor type used for partial computation of a layernorm operation
GemmLayernorm example = GEMM0 with partial reduction fused in epilogue (EpilogueVisitorLayerNorm)
+ lightweight full reduction kernel (ApplyFinalReduction)
+ GEMM1 with elementwise operations fused in mainloop (GemmLayernormMainloopFusion)
*/
#pragma once
/////////////////////////////////////////////////////////////////////////////////////////////////
#include "cutlass/cutlass.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/kernel/default_gemm_complex.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
typename ThreadblockShape_,
int ThreadCount,
typename OutputTileIterator_,
typename AccumulatorTile_,
typename ElementAccumulator_,
typename ElementVariance_,
typename ElementMean_,
typename ElementLayernormCompute_,
typename ElementwiseFunctor_,
bool IsShiftedVariance_ = false
>
class EpilogueVisitorLayerNorm {
public:
using ElementVariance = ElementVariance_;
using ElementMean = ElementMean_;
using ElementLayernormCompute = ElementLayernormCompute_;
using AccumulatorTile = AccumulatorTile_;
using ThreadblockShape = ThreadblockShape_;
static int const kThreadCount = ThreadCount;
using OutputTileIterator = OutputTileIterator_;
using ElementwiseFunctor = ElementwiseFunctor_;
static int const kIterations = OutputTileIterator::kIterations;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
static int const kRowIterations = OutputTileIterator::ThreadMap::Iterations::kRow;
static int const kThreads = OutputTileIterator::ThreadMap::kThreads;
static bool const kIsShiftedVariance = IsShiftedVariance_;
using ElementOutput = typename OutputTileIterator::Element;
static int const kDeltaRow = OutputTileIterator::ThreadMap::Delta::kRow;
/// Array type used in Shift-K Layernorm
static int const kRowAccessCount = kIterations * kRowIterations;
using ConvertedShiftFragment = Array<ElementLayernormCompute, kRowAccessCount>;
// Column-major output is handled by transposing the problem externally (already supported)
using LayoutOutput = cutlass::layout::RowMajor;
using ElementAccumulator = ElementAccumulator_;
using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
using LayernormFragment = Array<ElementLayernormCompute, kElementsPerAccess>;
using OutputVector = Array<ElementOutput, kElementsPerAccess>;
using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;
static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;
static int const kThreadsInColumn = kThreads / kThreadsPerRow;
static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1);
/// Argument structure
struct Arguments {
typename ElementwiseFunctor::Params elementwise;
ElementVariance *ptr_Variance;
ElementMean *ptr_Mean;
ElementOutput *ptr_Shifted_K;
MatrixCoord extent;
//
// Methods
//
Arguments():
ptr_Variance(nullptr),
ptr_Mean(nullptr),
ptr_Shifted_K(nullptr)
{
}
Arguments(
typename ElementwiseFunctor::Params elementwise_,
ElementVariance *ptr_Variance_,
ElementMean *ptr_Mean_,
ElementOutput *ptr_Shifted_K_ = nullptr,
MatrixCoord extent_ = MatrixCoord(0, 0)
):
elementwise(elementwise_),
ptr_Variance(ptr_Variance_),
ptr_Mean(ptr_Mean_),
ptr_Shifted_K(ptr_Shifted_K_),
extent(extent_)
{
}
};
struct Params {
typename ElementwiseFunctor::Params elementwise;
ElementVariance *ptr_Variance;
ElementMean *ptr_Mean;
ElementOutput *ptr_Shifted_K;
MatrixCoord extent;
//
// Methods
//
CUTLASS_HOST_DEVICE
Params():
ptr_Variance(nullptr),
ptr_Mean(nullptr),
ptr_Shifted_K(nullptr)
{
}
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
elementwise(args.elementwise),
ptr_Variance(args.ptr_Variance),
ptr_Mean(args.ptr_Mean),
ptr_Shifted_K(args.ptr_Shifted_K),
extent(args.extent)
{
}
};
/// Shared storage
struct SharedStorage {
};
private:
Params const & params_;
SharedStorage & shared_storage_;
MatrixCoord extent_;
ElementwiseFunctor elementwise_;
OutputTileIterator iterator_C_;
OutputTileIterator iterator_D_;
typename OutputTileIterator::Fragment fragment_C_;
typename OutputTileIterator::Fragment fragment_D_;
ElementAccumulator alpha_;
ElementAccumulator beta_;
ConvertedShiftFragment shift_k_frag_;
ElementLayernormCompute accum_sum_square_;
ElementLayernormCompute accum_sum_element_;
int thread_idx_;
MatrixCoord thread_offset_;
gemm::GemmCoord threadblock_tile_offset_;
public:
CUTLASS_DEVICE
EpilogueVisitorLayerNorm(
Params const &params, ///< Parameters routed to the epilogue
SharedStorage &shared_storage, ///< Shared storage needed by the functors here
MatrixCoord threadblock_offset,
gemm::GemmCoord threadblock_tile_offset,
int thread_idx,
OutputTileIterator destination_iterator, ///< Tile iterator for destination
OutputTileIterator source_iterator ///< Tile iterator for the source tensor
):
params_(params),
shared_storage_(shared_storage),
elementwise_(params.elementwise),
extent_(params.extent),
iterator_C_(source_iterator),
iterator_D_(destination_iterator),
threadblock_tile_offset_(threadblock_tile_offset),
thread_idx_(thread_idx)
{
alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
beta_ = (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);
if (beta_ == ElementAccumulator()) {
iterator_C_.clear_mask();
}
}
/// Helper to indicate split-K behavior
CUTLASS_DEVICE
void set_k_partition(
int split_k_index, ///< Index of this threadblock within split-K partitioned scheme
int split_k_slices) { ///< Total number of split-K slices
}
/// Called to set the batch index
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
}
/// Called at the start of the epilogue just before iterating over accumulator slices
CUTLASS_DEVICE
void begin_epilogue() {
// If shift-K feature is enabled, we load shift-k fragment
// at the very beginning of an epilogue
if (kIsShiftedVariance && params_.ptr_Shifted_K != nullptr) {
shift_k_frag_.clear();
int thread_offset_row_base = iterator_D_.thread_start_row();
CUTLASS_PRAGMA_UNROLL
for (int iter_idx = 0; iter_idx < kIterations; ++iter_idx) {
int step_offset = iter_idx * OutputTileIterator::Shape::kRow;
CUTLASS_PRAGMA_UNROLL
for (int rid = 0; rid < kRowIterations; ++rid) {
int row_step_offset = rid * kDeltaRow;
int row_offset = thread_offset_row_base + step_offset + row_step_offset;
bool is_load = (row_offset < extent_.row());
shift_k_frag_[iter_idx * kRowIterations + rid] = load_shift_k_(row_offset, is_load);
}
}
}
}
/// Called at the start of one step before starting accumulator exchange
CUTLASS_DEVICE
void begin_step(int step_idx) {
fragment_D_.clear();
if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
fragment_C_.clear();
iterator_C_.load(fragment_C_);
++iterator_C_;
}
}
/// Called at the start of a row
CUTLASS_DEVICE
void begin_row(int row_idx) {
/// set the accumulator to 0
accum_sum_element_ = ElementLayernormCompute(0);
accum_sum_square_ = ElementLayernormCompute(0);
}
/// Called after accumulators have been exchanged for each accumulator vector
CUTLASS_DEVICE
void visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorFragment const &accum) {
using Mul = cutlass::multiplies<ElementLayernormCompute>;
using Minus = cutlass::minus<ElementLayernormCompute>;
using Exp = cutlass::fast_exp_op<ElementLayernormCompute>;
Minus minus;
Mul mul;
Exp exponential;
LayernormFragment result;
thread_offset_ =
iterator_D_.thread_start() +
OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
NumericArrayConverter<ElementLayernormCompute, ElementOutput, kElementsPerAccess> source_converter;
OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];
bool column_guard = (thread_offset_.column() < extent_.column());
if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
result = source_converter(elementwise_(accum));
} else {
result = source_converter(elementwise_(accum, source_vector));
}
ElementLayernormCompute inv_scalar = cutlass::constants::one<ElementLayernormCompute>() / ElementLayernormCompute(extent_.column());
// Fragment is cleared for non-reachable columns so no need to check against column guard
ElementLayernormCompute accum_sum_element_tmp = element_sum_accumulator_(result);
// The square sum is different: fragments in non-reachable columns are zero, but with
// shift-K enabled each such zero would contribute (0 - k)^2 = k^2 to the square sum,
// so the column guard must exclude them.
ElementLayernormCompute accum_sum_square_tmp = ElementLayernormCompute(0);
if (column_guard) {
accum_sum_square_tmp = (kIsShiftedVariance) ? \
square_sum_accumulator_(result, shift_k_frag_[iter_idx * kRowIterations + row_idx]) : \
square_sum_accumulator_(result);
}
accum_sum_element_tmp *= inv_scalar;
accum_sum_square_tmp *= inv_scalar;
// After performing the in-thread reduction, we then perform cross-thread / in-warp reduction
CUTLASS_PRAGMA_UNROLL
for (int i = kHalfThreadsPerRow; i > 0; i >>= 1) {
accum_sum_element_tmp += __shfl_xor_sync(0xFFFFFFFF, accum_sum_element_tmp, i);
accum_sum_square_tmp += __shfl_xor_sync(0xFFFFFFFF, accum_sum_square_tmp, i);
}
accum_sum_element_ += accum_sum_element_tmp;
accum_sum_square_ += accum_sum_square_tmp;
// Convert to the output
NumericArrayConverter<ElementOutput, ElementLayernormCompute, kElementsPerAccess> output_converter;
OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
output = output_converter(result);
}
/// Called at the end of a row
CUTLASS_DEVICE
void end_row(int row_idx) {
using ConvertVarianceOutput = cutlass::NumericConverter<ElementVariance, ElementLayernormCompute>;
using ConvertMeanOutput = cutlass::NumericConverter<ElementMean, ElementLayernormCompute>;
ConvertVarianceOutput convert_variance_output;
ConvertMeanOutput convert_mean_output;
bool is_write_thread = (thread_offset_.row() < extent_.row() && (threadIdx.x % kThreadsPerRow) == 0);
int row_offset = thread_offset_.row() + threadblock_tile_offset_.n() * extent_.row();
ElementVariance *curr_ptr_sum_square = params_.ptr_Variance + row_offset;
ElementMean *curr_ptr_element_sum = params_.ptr_Mean + row_offset;
arch::global_store<ElementVariance, sizeof(ElementVariance)>(
convert_variance_output(accum_sum_square_),
(void *)curr_ptr_sum_square,
is_write_thread);
arch::global_store<ElementMean, sizeof(ElementMean)>(
convert_mean_output(accum_sum_element_),
(void *)curr_ptr_element_sum,
is_write_thread);
}
/// Called after all accumulator elements have been visited
CUTLASS_DEVICE
void end_step(int step_idx) {
iterator_D_.store(fragment_D_);
++iterator_D_;
}
/// Called after all steps have been completed
CUTLASS_DEVICE
void end_epilogue() {
}
private:
CUTLASS_DEVICE
ElementLayernormCompute load_shift_k_(int row_offset, bool is_load) {
using ConvertShiftK = cutlass::NumericConverter<ElementLayernormCompute, ElementOutput>;
ConvertShiftK convert_shift_k;
ElementOutput shift_k_val;
// Computes the address to load shift_k element
ElementOutput *curr_ptr_shift_k = params_.ptr_Shifted_K + row_offset;
// Conditionally loads from global memory
arch::global_load<ElementOutput, sizeof(ElementOutput)>(shift_k_val, (void *)curr_ptr_shift_k, is_load);
// Converts data type to return
ElementLayernormCompute converted_shift_k_val = convert_shift_k(shift_k_val);
return converted_shift_k_val;
}
CUTLASS_DEVICE
ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum) {
ElementLayernormCompute sum_ = ElementLayernormCompute(0);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < LayernormFragment::kElements; ++i) {
auto accum_ = accum[i];
sum_ += accum_ * accum_;
}
return sum_;
}
CUTLASS_DEVICE
ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum, ElementLayernormCompute shift_k_val) {
ElementLayernormCompute sum_ = ElementLayernormCompute(0);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < LayernormFragment::kElements; ++i) {
auto accum_ = accum[i] - shift_k_val;
sum_ += accum_ * accum_;
}
return sum_;
}
CUTLASS_DEVICE
ElementLayernormCompute element_sum_accumulator_(LayernormFragment const &accum) {
ElementLayernormCompute sum_ = ElementLayernormCompute(0);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < LayernormFragment::kElements; ++i) {
sum_ += accum[i];
}
return sum_;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
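The shift-K path above is a numerical-stability device: for any constant k, Var(x) = E[(x - k)^2] - (E[x] - k)^2, and choosing k close to the data keeps both terms small enough to avoid the catastrophic cancellation of the naive E[x^2] - E[x]^2 form when the mean is large relative to the variance. A small host-side C++ reference with made-up data demonstrates the identity:

#include <cstdio>
#include <vector>

// Host reference for the shift-K trick: k is any constant near the data
// (here the first element, as loaded by load_shift_k_() above).
int main() {
  std::vector<float> x = {1000.0f, 1001.0f, 1002.0f, 1003.0f};
  float k = x[0];
  double sum = 0.0, sum_sq_shifted = 0.0;
  for (float v : x) {
    sum += v;
    double d = v - k;
    sum_sq_shifted += d * d;   // accumulates (x - k)^2, as in square_sum_accumulator_
  }
  double n = double(x.size());
  double mean = sum / n;
  double var = sum_sq_shifted / n - (mean - k) * (mean - k);
  std::printf("mean=%f var=%f\n", mean, var);   // mean=1001.500000 var=1.250000
  return 0;
}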

View File

@ -1,77 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Binds GEMM-related enum types to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/gemm/gemm.h"
#include "host.h"
namespace py = pybind11;
void bind_gemm(py::module &m) {
//
// Enumerate types
// cutlass/gemm/gemm.h
py::enum_<cutlass::gemm::GemmUniversalMode>(m, "Mode")
.value("Gemm", cutlass::gemm::GemmUniversalMode::kGemm, "Ordinary GEMM & GEMM Split-K serial")
.value("GemmSplitKParallel", cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, "GEMM Split-K parallel")
.value("Batched", cutlass::gemm::GemmUniversalMode::kBatched, "Batched GEMM")
.value("Array", cutlass::gemm::GemmUniversalMode::kArray)
.value("Invalid", cutlass::gemm::GemmUniversalMode::kInvalid);
/// GemmCoord is a structure that specifies a location within the coordinate space of a GEMM problem
py::class_<cutlass::gemm::GemmCoord>(m, "GemmCoord")
.def(py::init<int, int, int>())
.def("m", py::overload_cast<>(&cutlass::gemm::GemmCoord::m))
.def("n", py::overload_cast<>(&cutlass::gemm::GemmCoord::n))
.def("k", py::overload_cast<>(&cutlass::gemm::GemmCoord::k))
// get tensor coords
.def("mk",
[](const cutlass::gemm::GemmCoord & problem_size) {
return cutlass::MatrixCoord(problem_size.mk());
})
.def("kn",
[](const cutlass::gemm::GemmCoord & problem_size) {
return cutlass::MatrixCoord(problem_size.kn());
})
.def("mn",
[](const cutlass::gemm::GemmCoord & problem_size) {
return cutlass::MatrixCoord(problem_size.mn());
});
py::module_ host_submodule = m.def_submodule("host");
bind_gemm_host_helper(host_submodule);
}
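For context, bind_gemm() above would typically be registered into a single pybind11 extension module. A hypothetical top-level entry point might look like the following; the module name is illustrative and not taken from the source tree.

#include <pybind11/pybind11.h>

namespace py = pybind11;

void bind_gemm(py::module &m);  // defined above

// Hypothetical extension-module entry point collecting the GEMM bindings.
PYBIND11_MODULE(example_cutlass_bindings, m) {
  m.doc() = "Example registration of the GEMM enum/coordinate bindings";
  bind_gemm(m);
}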

View File

@ -1,628 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/params_universal_base.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/complex.h"
#include "cutlass/semaphore.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/trace.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate
typename Epilogue_, ///! Epilogue
typename ThreadblockSwizzle_ ///! Threadblock swizzling function
>
struct GemmUniversalwithEpilogueVisitor {
public:
using Mma = Mma_;
using Epilogue = Epilogue_;
using EpilogueVisitor = typename Epilogue::Visitor;
using ThreadblockSwizzle = ThreadblockSwizzle_;
using ElementA = typename Mma::IteratorA::Element;
using LayoutA = typename Mma::IteratorA::Layout;
using ElementB = typename Mma::IteratorB::Element;
using LayoutB = typename Mma::IteratorB::Layout;
using ElementC = typename EpilogueVisitor::ElementOutput;
using LayoutC = typename EpilogueVisitor::OutputTileIterator::Layout;
static ComplexTransform const kTransformA = Mma::kTransformA;
static ComplexTransform const kTransformB = Mma::kTransformB;
using Operator = typename Mma::Operator;
using OperatorClass = typename Mma::Operator::OperatorClass;
using ThreadblockShape = typename Mma::Shape;
using WarpShape = typename Mma::Operator::Shape;
using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
using ArchTag = typename Mma::ArchTag;
static int const kStages = Mma::kStages;
static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;
/// Warp count (concept: GemmShape)
using WarpCount = typename Mma::WarpCount;
static int const kThreadCount = 32 * WarpCount::kCount;
/// Split-K preserves splits that are 128b aligned
static int const kSplitKAlignment = const_max(
128 / sizeof_bits<ElementA>::value,
128 / sizeof_bits<ElementB>::value
);
//
// Structures
//
/// Argument structure
struct Arguments : UniversalArgumentsBase {
//
// Data members
//
typename EpilogueVisitor::Arguments epilogue_visitor;
void const * ptr_A;
void const * ptr_B;
void const * ptr_C;
void * ptr_D;
int64_t batch_stride_A;
int64_t batch_stride_B;
int64_t batch_stride_C;
typename LayoutA::Stride stride_a;
typename LayoutB::Stride stride_b;
typename LayoutC::Stride stride_c;
typename LayoutC::Stride stride_d;
typename LayoutA::Stride::LongIndex lda;
typename LayoutB::Stride::LongIndex ldb;
typename LayoutC::Stride::LongIndex ldc;
typename LayoutC::Stride::LongIndex ldd;
int const * ptr_gather_A_indices;
int const * ptr_gather_B_indices;
int const * ptr_scatter_D_indices;
//
// Methods
//
Arguments():
ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
ptr_gather_A_indices(nullptr),
ptr_gather_B_indices(nullptr),
ptr_scatter_D_indices(nullptr) {}
/// constructs an arguments structure
Arguments(
GemmUniversalMode mode,
GemmCoord problem_size,
int batch_count,
typename EpilogueVisitor::Arguments epilogue_visitor,
void const * ptr_A,
void const * ptr_B,
void const * ptr_C,
void * ptr_D,
int64_t batch_stride_A,
int64_t batch_stride_B,
int64_t batch_stride_C,
int64_t batch_stride_D,
typename LayoutA::Stride stride_a,
typename LayoutB::Stride stride_b,
typename LayoutC::Stride stride_c,
typename LayoutC::Stride stride_d,
int const *ptr_gather_A_indices = nullptr,
int const *ptr_gather_B_indices = nullptr,
int const *ptr_scatter_D_indices = nullptr
):
UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
epilogue_visitor(epilogue_visitor),
ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
ptr_scatter_D_indices(ptr_scatter_D_indices) {
lda = 0;
ldb = 0;
ldc = 0;
ldd = 0;
CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
}
/// constructs an arguments structure
Arguments(
GemmUniversalMode mode,
GemmCoord problem_size,
int batch_count,
typename EpilogueVisitor::Arguments epilogue_visitor,
void const * ptr_A,
void const * ptr_B,
void const * ptr_C,
void * ptr_D,
int64_t batch_stride_A,
int64_t batch_stride_B,
int64_t batch_stride_C,
int64_t batch_stride_D,
typename LayoutA::Stride::LongIndex lda,
typename LayoutB::Stride::LongIndex ldb,
typename LayoutC::Stride::LongIndex ldc,
typename LayoutC::Stride::LongIndex ldd,
int const *ptr_gather_A_indices = nullptr,
int const *ptr_gather_B_indices = nullptr,
int const *ptr_scatter_D_indices = nullptr
):
UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
epilogue_visitor(epilogue_visitor),
ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
ptr_scatter_D_indices(ptr_scatter_D_indices) {
stride_a = make_Coord(lda);
stride_b = make_Coord(ldb);
stride_c = make_Coord(ldc);
stride_d = make_Coord(ldd);
CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
}
/// Returns arguments for the transposed problem
Arguments transposed_problem() const {
Arguments args(*this);
std::swap(args.problem_size.m(), args.problem_size.n());
std::swap(args.ptr_A, args.ptr_B);
std::swap(args.lda, args.ldb);
std::swap(args.stride_a, args.stride_b);
std::swap(args.batch_stride_A, args.batch_stride_B);
std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices);
return args;
}
};
//
// Structure for precomputing values in host memory and passing to kernels
//
/// Parameters structure
struct Params : UniversalParamsBase<
ThreadblockSwizzle,
ThreadblockShape,
ElementA,
ElementB,
ElementC> {
using ParamsBase = UniversalParamsBase<
ThreadblockSwizzle,
ThreadblockShape,
ElementA,
ElementB,
ElementC>;
typename Mma::IteratorA::Params params_A;
typename Mma::IteratorB::Params params_B;
typename EpilogueVisitor::OutputTileIterator::Params params_C;
typename EpilogueVisitor::OutputTileIterator::Params params_D;
typename EpilogueVisitor::Params epilogue_visitor;
void * ptr_A;
void * ptr_B;
void * ptr_C;
void * ptr_D;
int64_t batch_stride_A;
int64_t batch_stride_B;
int64_t batch_stride_C;
int * ptr_gather_A_indices;
int * ptr_gather_B_indices;
int * ptr_scatter_D_indices;
int *semaphore;
//
// Methods
//
/// Default constructor
Params() = default;
CUTLASS_HOST_DEVICE
Params(
Arguments const &args,
int device_sms,
int sm_occupancy
):
ParamsBase(args, device_sms, sm_occupancy),
params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
epilogue_visitor(args.epilogue_visitor),
ptr_A(const_cast<void *>(args.ptr_A)),
ptr_B(const_cast<void *>(args.ptr_B)),
ptr_C(const_cast<void *>(args.ptr_C)),
ptr_D(args.ptr_D),
batch_stride_A(args.batch_stride_A),
batch_stride_B(args.batch_stride_B),
batch_stride_C(args.batch_stride_C),
ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices)) {
}
CUTLASS_HOST_DEVICE
void update(
Arguments const &args,
void *workspace = nullptr) {
ptr_A = const_cast<void *>(args.ptr_A);
ptr_B = const_cast<void *>(args.ptr_B);
ptr_C = const_cast<void *>(args.ptr_C);
ptr_D = args.ptr_D;
ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
batch_stride_A = args.batch_stride_A;
batch_stride_B = args.batch_stride_B;
batch_stride_C = args.batch_stride_C;
epilogue_visitor = args.epilogue_visitor;
semaphore = static_cast<int *>(workspace);
CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
}
};
/// Shared memory storage structure
union SharedStorage {
typename Mma::SharedStorage main_loop;
typename Epilogue::SharedStorage epilogue;
typename EpilogueVisitor::SharedStorage visitor;
};
public:
//
// Methods
//
CUTLASS_DEVICE
GemmUniversalwithEpilogueVisitor() { }
/// Determines whether kernel satisfies alignment
static Status can_implement(
cutlass::gemm::GemmCoord const & problem_size) {
CUTLASS_TRACE_HOST("GemmUniversalwithEpilogueVisitor::can_implement()");
static int const kAlignmentA = (platform::is_same<LayoutA,
layout::ColumnMajorInterleaved<32>>::value)
? 32
: (platform::is_same<LayoutA,
layout::ColumnMajorInterleaved<64>>::value)
? 64
: Mma::IteratorA::AccessType::kElements;
static int const kAlignmentB = (platform::is_same<LayoutB,
layout::RowMajorInterleaved<32>>::value)
? 32
: (platform::is_same<LayoutB,
layout::RowMajorInterleaved<64>>::value)
? 64
: Mma::IteratorB::AccessType::kElements;
static int const kAlignmentC = (platform::is_same<LayoutC,
layout::ColumnMajorInterleaved<32>>::value)
? 32
: (platform::is_same<LayoutC,
layout::ColumnMajorInterleaved<64>>::value)
? 64
: Epilogue::OutputTileIterator::kElementsPerAccess;
bool isAMisaligned = false;
bool isBMisaligned = false;
bool isCMisaligned = false;
if (platform::is_same<LayoutA, layout::RowMajor>::value) {
isAMisaligned = problem_size.k() % kAlignmentA;
} else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
isAMisaligned = problem_size.m() % kAlignmentA;
} else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
|| platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
isAMisaligned = problem_size.k() % kAlignmentA;
}
if (platform::is_same<LayoutB, layout::RowMajor>::value) {
isBMisaligned = problem_size.n() % kAlignmentB;
} else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
isBMisaligned = problem_size.k() % kAlignmentB;
} else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
|| platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
isBMisaligned = problem_size.k() % kAlignmentB;
}
if (platform::is_same<LayoutC, layout::RowMajor>::value) {
isCMisaligned = problem_size.n() % kAlignmentC;
} else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
isCMisaligned = problem_size.m() % kAlignmentC;
} else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
|| platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
isCMisaligned = problem_size.n() % kAlignmentC;
}
if (isAMisaligned) {
CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand");
return Status::kErrorMisalignedOperand;
}
if (isBMisaligned) {
CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand");
return Status::kErrorMisalignedOperand;
}
if (isCMisaligned) {
CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand");
return Status::kErrorMisalignedOperand;
}
CUTLASS_TRACE_HOST(" returning kSuccess");
return Status::kSuccess;
}
static Status can_implement(Arguments const &args) {
return can_implement(args.problem_size);
}
/// Executes one GEMM
CUTLASS_DEVICE
void operator()(Params const &params, SharedStorage &shared_storage) {
// Compute threadblock location
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
return;
}
int offset_k = 0;
int problem_size_k = params.problem_size.k();
ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
//
// Fetch pointers based on mode.
//
if (params.mode == GemmUniversalMode::kGemm ||
params.mode == GemmUniversalMode::kGemmSplitKParallel) {
if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
}
offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
}
else if (params.mode == GemmUniversalMode::kBatched) {
ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
}
else if (params.mode == GemmUniversalMode::kArray) {
ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
}
__syncthreads();
// Compute initial location in logical coordinates
cutlass::MatrixCoord tb_offset_A{
threadblock_tile_offset.m() * Mma::Shape::kM,
offset_k,
};
cutlass::MatrixCoord tb_offset_B{
offset_k,
threadblock_tile_offset.n() * Mma::Shape::kN
};
// Compute position within threadblock
int thread_idx = threadIdx.x;
// Construct iterators to A and B operands
typename Mma::IteratorA iterator_A(
params.params_A,
ptr_A,
{params.problem_size.m(), problem_size_k},
thread_idx,
tb_offset_A,
params.ptr_gather_A_indices);
typename Mma::IteratorB iterator_B(
params.params_B,
ptr_B,
{problem_size_k, params.problem_size.n()},
thread_idx,
tb_offset_B,
params.ptr_gather_B_indices);
// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
int lane_idx = threadIdx.x % 32;
//
// Main loop
//
// Construct thread-scoped matrix multiply
Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
typename Mma::FragmentC accumulators;
accumulators.clear();
// Number of threadblock-scoped k iterations (ceiling division over the tile's K extent)
int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
// Compute threadblock-scoped matrix multiply-add
mma(
gemm_k_iterations,
accumulators,
iterator_A,
iterator_B,
accumulators);
//
// Epilogue
//
// Masked tile iterators constructed from members
//
threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Assume identity swizzle
MatrixCoord threadblock_offset(
threadblock_tile_offset.m() * Mma::Shape::kM,
threadblock_tile_offset.n() * Mma::Shape::kN
);
int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
//
// Fetch pointers based on mode.
//
// Construct the semaphore.
Semaphore semaphore(params.semaphore + block_idx, thread_idx);
// Tile iterator loading from source tensor.
EpilogueVisitor epilogue_visitor(
params.epilogue_visitor,
shared_storage.visitor,
threadblock_offset,
threadblock_tile_offset,
thread_idx,
params.problem_size.mn()
);
if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
}
Epilogue epilogue(
shared_storage.epilogue,
thread_idx,
warp_idx,
lane_idx);
// Wait on the semaphore - this latency may have been covered by iterator construction
if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
// For subsequent threadblocks, the source matrix is held in the 'D' tensor.
semaphore.wait(threadblock_tile_offset.k());
}
// Execute the epilogue operator to update the destination tensor.
epilogue(epilogue_visitor, accumulators);
//
// Release the semaphore
//
if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
int lock = 0;
if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
// The final threadblock resets the semaphore for subsequent grids.
lock = 0;
}
else {
// Otherwise, the semaphore is incremented
lock = threadblock_tile_offset.k() + 1;
}
semaphore.release(lock);
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
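To make the serial split-K handshake in the epilogue above concrete, here is an illustrative Python sketch of the lock sequence (not part of the source; wait_until/release stand in for Semaphore::wait and Semaphore::release):

# Each k-slice waits until the semaphore holds its own index, runs the
# epilogue, then releases slice+1; the final slice resets the lock to 0.
for b in range(num_k_slices):          # each b runs in a distinct threadblock
    wait_until(semaphore, b)           # semaphore.wait(threadblock_tile_offset.k())
    run_epilogue(b)
    release(semaphore, 0 if b == num_k_slices - 1 else b + 1)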

View File

@ -1,47 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm host helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
void bind_gemm_host_helper(py::module &m) {
m.def("reorder_column", &cutlass::reorder_column<32, int8_t, cutlass::layout::RowMajorInterleaved<32>>);
m.def("reorder_column", &cutlass::reorder_column<32, int8_t, cutlass::layout::ColumnMajorInterleaved<32>>);
}
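For reference, a minimal Python-side sketch of invoking the reorder helper bound above; the import name `cutlass` and the construction of the interleaved tensor refs are assumptions, not part of this file:

# dest_ref / src_ref: int8 TensorRefs with RowMajorInterleaved<32> layout,
# e.g. built via get_tensor_ref; problem_size: a cutlass.gemm.GemmCoord.
cutlass.reorder_column(dest_ref, src_ref, problem_size)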

View File

@ -1,47 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind CUTLASS layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "tensor.h"
#include "matrix.h"
namespace py = pybind11;
void bind_layout(py::module &m) {
bind_tensor_layout(m);
bind_matrix_layout(m);
}

View File

@ -1,87 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Matrix layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/layout/matrix.h"
namespace py = pybind11;
void bind_matrix_layout(py::module &m) {
//
// Matrix layouts
// cutlass/layout/matrix.h
//
py::class_<cutlass::layout::RowMajor>(m, "RowMajor", R"pbdoc(
Mapping function for row-major matrices.
)pbdoc")
.def_static("packed", &cutlass::layout::RowMajor::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", [](const cutlass::layout::RowMajor & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
py::class_<cutlass::layout::ColumnMajor>(m, "ColumnMajor", R"pbdoc(
Mapping function for column-major matrices.
)pbdoc")
.def_static("packed", &cutlass::layout::ColumnMajor::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc" )
.def("stride", [](const cutlass::layout::ColumnMajor & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
py::class_<cutlass::layout::RowMajorInterleaved<32>>(m, "RowMajorInterleaved32",
R"pbdoc(Mapping function for interleaved matrices. Matrix is structured
as row-major arrangement of fixed-size columns 32)pbdoc")
.def_static("packed", &cutlass::layout::RowMajorInterleaved<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", [](const cutlass::layout::RowMajorInterleaved<32> & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
py::class_<cutlass::layout::ColumnMajorInterleaved<32>>(m, "ColumnMajorInterleaved32",
R"pbdoc(Mapping function for interleaved matrices. Matrix is structured
as column-major arrangement of fixed-size rows 32)pbdoc")
.def_static("packed", &cutlass::layout::ColumnMajorInterleaved<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", [](const cutlass::layout::ColumnMajorInterleaved<32> & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
}
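A brief usage sketch of the matrix-layout bindings above from Python (module name assumed; MatrixCoord is bound elsewhere in this commit):

# Packed 128x64 row-major layout; stride() yields the leading dimension.
layout = cutlass.RowMajor.packed(cutlass.MatrixCoord(128, 64))
assert layout.stride() == 64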

View File

@ -1,74 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Tensor layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
void bind_tensor_layout(py::module &m) {
//
// Tensor layouts
// cutlass/include/cutlass/layout/tensor.h
//
/// Mapping function for 4-D NHWC tensors.
py::class_<cutlass::layout::TensorNHWC>(m, "TensorNHWC",
R"pbdoc(Mapping function for 4-D NHWC tensors)pbdoc")
.def_static("packed", &cutlass::layout::TensorNHWC::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed NHWC tensor)pbdoc")
.def("stride", py::overload_cast<>(&cutlass::layout::TensorNHWC::stride),
R"pbdoc(Returns the stride of the layout)pbdoc");
/// Mapping function for 4-D NC/xHWx tensors.
py::class_<cutlass::layout::TensorNCxHWx<32>>(m, "TensorNC32HW32",
R"pbdoc(Mapping function for 4-D NC/32HW32 tensors)pbdoc")
.def_static("packed", &cutlass::layout::TensorNCxHWx<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", py::overload_cast<>(&cutlass::layout::TensorNCxHWx<32>::stride),
R"pbdoc(Returns the stride of the layout)pbdoc");
/// Mapping function for 4-D CxRSKx tensors.
py::class_<cutlass::layout::TensorCxRSKx<32>>(m, "TensorC32RSK32",
R"pbdoc(Mapping function for 4-D C32RSK32 tensors)pbdoc")
.def_static("packed", &cutlass::layout::TensorCxRSKx<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", py::overload_cast<>(&cutlass::layout::TensorCxRSKx<32>::stride),
R"pbdoc(Returns the stride of the layout)pbdoc");
}
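A hypothetical Python-side use of the NHWC layout binding (import name assumed):

# Packed NHWC layout for a 1x7x7x64 tensor; the packed strides are
# (C, W*C, H*W*C) = (64, 448, 3136).
layout = cutlass.TensorNHWC.packed(cutlass.Tensor4DCoord(1, 7, 7, 64))
strides = layout.stride()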

View File

@ -1,159 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind threadblock swizzling to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"
#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <memory>
#include <string>
namespace py = pybind11;
// Demangles a C++ symbol name. __cxa_demangle returns a malloc()-allocated
// buffer, so it must be released with std::free rather than delete.
std::string demangle(const char* mangled_name) {
int status = 0;
std::unique_ptr<char, void (*)(void*)> ptr(
__cxxabiv1::__cxa_demangle(mangled_name, nullptr, nullptr, &status),
std::free);
return (status == 0 && ptr) ? std::string(ptr.get()) : std::string(mangled_name);
}
template<typename T>
void bind_identity_swizzle(py::module & m, std::string name) {
py::class_<T>(m, name.c_str(),
R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc")
.def(py::init<>())
.def("get_tiled_shape",
py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: gemm(M, N, K)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_tiled_shape",
py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_tiled_shape",
py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv3dProblemSize&, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NZPQK, NDHWC, KTRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_grid_shape", &T::get_grid_shape,
py::arg("tiled_shape"),
R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
.def("tag", [](const T & swizzle){
return demangle(typeid(T).name());
}, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
}
template<typename T>
void bind_swizzle(py::module & m, std::string name, std::string doc) {
py::class_<T>(m, name.c_str(), doc.c_str())
.def(py::init<>())
.def("get_tiled_shape",
py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: gemm(M, N, K)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_grid_shape", &T::get_grid_shape,
py::arg("tiled_shape"),
R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
.def("tag", [](const T & swizzle){
return demangle(typeid(T).name());
}, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
}
template<typename T>
void bind_dgrad_swizzle(py::module & m, std::string name) {
py::class_<T>(m, name.c_str(),
R"pbdoc(Threadblock swizzling function for strided dgrad convolution)pbdoc")
.def(py::init<>())
.def("get_tiled_shape",
py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_grid_shape", [](const T & swizzle, cutlass::gemm::GemmCoord tiled_shape) {
return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
}, py::arg("tiled_shape"),
R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
.def("tag", [](const T & swizzle){
return demangle(typeid(T).name());
}, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
}
void bind_threadblock_swizzle(py::module &m) {
py::class_<dim3>(m, "dim3",
R"pbdoc(A int3 type xyz contains three integers)pbdoc")
.def(py::init<int, int, int>(),
py::arg("x"), py::arg("y"), py::arg("z"))
.def_readwrite("x", &dim3::x, R"pbdoc(get value x)pbdoc")
.def_readwrite("y", &dim3::y, R"pbdoc(get value y)pbdoc")
.def_readwrite("z", &dim3::z, R"pbdoc(get value z)pbdoc");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>>(m, "IdentitySwizzle1");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>>(m, "IdentitySwizzle2");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>>(m, "IdentitySwizzle4");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>>(m, "IdentitySwizzle8");
bind_swizzle<cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle>(m, "HorizontalSwizzle", R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc");
bind_swizzle<cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle>(m, "BatchedIdentitySwizzle", R"pbdoc(Threadblock swizzling function for batched GEMMs)pbdoc");
bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>>(m, "StridedDgradIdentitySwizzle1");
bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>>(m, "StridedDgradIdentitySwizzle4");
bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle>(m, "StridedDgradHorizontalSwizzle");
}
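Illustrative Python usage of the swizzle bindings above; `cutlass.gemm.GemmCoord` is assumed to be bound elsewhere in the extension:

# Tile a 1024x512x256 GEMM into 128x128x32 threadblocks with no split-K.
swizzle = cutlass.IdentitySwizzle1()
tiled = swizzle.get_tiled_shape(cutlass.gemm.GemmCoord(1024, 512, 256),
                                cutlass.gemm.GemmCoord(128, 128, 32), 1)
grid = swizzle.get_grid_shape(tiled)   # dim3 covering the 8x4 grid of tiles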

View File

@ -1,78 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Tensor Coord to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/tensor_coord.h"
namespace py = pybind11;
void bind_tensor_coord(py::module &m) {
//
// Tensor Coords
// cutlass/include/cutlass/tensor_coord.h
//
/// Defines a canonical 4D coordinate used by tensor operations.
py::class_<cutlass::Tensor4DCoord>(m, "Tensor4DCoord",
R"pbdoc(Defines a canonical 4D coordinate used by tensor operations)pbdoc")
.def(py::init<int, int, int, int>(),
py::arg("n"), py::arg("h"), py::arg("w"), py::arg("c"),
R"pbdoc(Helper to construct from N, H, W, and C)pbdoc")
.def("at", py::overload_cast<int>(&cutlass::Tensor4DCoord::at),
py::arg("dim"),
R"pbdoc(Gets the index of a given Coord element)pbdoc")
.def("size", [](const cutlass::Tensor4DCoord & coord) {
return coord.at(0) * coord.at(1) * coord.at(2) * coord.at(3);},
R"pbdoc(The size of the tensor coord)pbdoc");
py::class_<cutlass::Coord<3>>(m, "Tensor3DCoord",
R"pbdoc(Defines a canonical 3D coordinate used by tensor operations)pbdoc")
.def("at", py::overload_cast<int>(&cutlass::Coord<3>::at),
py::arg("dim"),
R"pbdoc(Gets the index of a given Coord element)pbdoc");
// Matrix Size
py::class_<cutlass::MatrixCoord>(m, "MatrixCoord",
R"pbdoc(MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes
expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord.)pbdoc")
.def(py::init<int, int>(),
py::arg("row"), py::arg("column"), R"pbdoc(Helper to construct from a row and column)pbdoc")
.def("row", py::overload_cast<>(&cutlass::MatrixCoord::row),
R"pbdoc(Returns the row of the coordinate)pbdoc")
.def("column", py::overload_cast<>(&cutlass::MatrixCoord::column),
R"pbdoc(Returns the column of the coordinate)pbdoc");
}
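A short sketch of these coordinate bindings in use (module name assumed):

coord = cutlass.Tensor4DCoord(2, 7, 7, 64)
assert coord.size() == 2 * 7 * 7 * 64      # product of all four extents
mc = cutlass.MatrixCoord(128, 64)
assert (mc.row(), mc.column()) == (128, 64)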

View File

@ -1,102 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind TensorRef and View to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/tensor_ref.h"
#include "cutlass/tensor_view.h"
#include "types.h"
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
.def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
T* ptr = reinterpret_cast< T*>(address);
new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
})
.def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
T* ptr = tensor_ref.data();
return int64_t(ptr);
})
.def("layout", py::overload_cast<>(&cutlass::TensorRef<T, L>::layout));
m.def("get_tensor_ref", [](int64_t address, TF data, const L& layout_) {
T* ptr = reinterpret_cast<T*>(address);
cutlass::TensorRef<T, L> tensor_ref = cutlass::TensorRef<T, L>(ptr, layout_);
return tensor_ref;
});
py::class_<cutlass::TensorView<T, L>>(m, ("TensorView" + name).c_str())
.def(py::init<const cutlass::TensorRef<T, L>&, const typename L::TensorCoord &>());
}
void bind_tensor_refs_and_views(py::module &m) {
/// float
bind_tensor_ref_view<float, cutlass::layout::RowMajor, cutlass::float32>(m, "F32RowMajor");
bind_tensor_ref_view<float, cutlass::layout::ColumnMajor, cutlass::float32>(m, "F32ColumnMajor");
bind_tensor_ref_view<float, cutlass::layout::TensorNHWC, cutlass::float32>(m, "F32NHWC");
/// double
bind_tensor_ref_view<double, cutlass::layout::RowMajor, cutlass::float64>(m, "F64RowMajor");
bind_tensor_ref_view<double, cutlass::layout::ColumnMajor, cutlass::float64>(m, "F64ColumnMajor");
bind_tensor_ref_view<double, cutlass::layout::TensorNHWC, cutlass::float64>(m, "F64NHWC");
// half_t
bind_tensor_ref_view<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t>(m, "F16RowMajor");
bind_tensor_ref_view<cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t>(m, "F16ColumnMajor");
bind_tensor_ref_view<cutlass::half_t, cutlass::layout::TensorNHWC, cutlass::half_t>(m, "F16NHWC");
// bfloat16
bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t>(m, "BF16RowMajor");
bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::ColumnMajor, cutlass::bfloat16_t>(m, "BF16ColumnMajor");
bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::TensorNHWC, cutlass::bfloat16_t>(m, "BF16NHWC");
// int8_t
bind_tensor_ref_view<int8_t, cutlass::layout::RowMajorInterleaved<32>, cutlass::int8>(m, "S8RowMajorInterleaved32");
bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajorInterleaved<32>, cutlass::int8>(m, "S8ColumnMajorInterleaved32");
bind_tensor_ref_view<int8_t, cutlass::layout::RowMajor, cutlass::int8>(m, "S8RowMajor");
bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajor, cutlass::int8>(m, "S8ColumnMajor");
bind_tensor_ref_view<int8_t, cutlass::layout::TensorNHWC, cutlass::int8>(m, "S8NHWC");
bind_tensor_ref_view<int8_t, cutlass::layout::TensorNCxHWx<32>, cutlass::int8>(m, "S8NC32HW32");
bind_tensor_ref_view<int8_t, cutlass::layout::TensorCxRSKx<32>, cutlass::int8>(m, "S8C32RSK32");
// int32_t
bind_tensor_ref_view<int32_t, cutlass::layout::RowMajor, cutlass::int32>(m, "S32RowMajor");
bind_tensor_ref_view<int32_t, cutlass::layout::ColumnMajor, cutlass::int32>(m, "S32ColumnMajor");
bind_tensor_ref_view<int32_t, cutlass::layout::TensorNHWC, cutlass::int32>(m, "S32NHWC");
}
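A hypothetical round-trip through the TensorRef bindings, passing a raw pointer as an integer address (numpy used for illustration; import name assumed):

import numpy as np
a = np.zeros((128, 64), dtype=np.float32)
layout = cutlass.RowMajor.packed(cutlass.MatrixCoord(128, 64))
ref = cutlass.TensorRefF32RowMajor(a.ctypes.data, layout)
assert ref.data() == a.ctypes.data     # the raw pointer round-trips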

View File

@ -1,146 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind CUTLASS types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/half.h"
namespace py = pybind11;
namespace cutlass {
/// 8-bit signed integer
struct alignas(1) int8 {
int8_t storage;
explicit int8(int x) {
storage = int8_t(x);
}
explicit int8(float x) {
storage = int8_t(x);
}
int8_t c_value(){return storage;}
};
/// 32-bit signed integer
struct alignas(4) int32 {
int storage;
explicit int32(int x) {
storage = x;
}
explicit int32(float x) {
storage = int(x);
}
int c_value(){return storage;}
};
/// IEEE single-precision floating-point type
struct alignas(4) float32 {
float storage;
explicit float32(float x) {
storage = x;
}
explicit float32(int x) {
storage = float(x);
}
float c_value(){return storage;}
};
/// IEEE double-precision floating-point type
struct alignas(8) float64 {
double storage;
explicit float64(float x) {
storage = double(x);
}
explicit float64(int x) {
storage = double(x);
}
double c_value(){return storage;}
};
}
void bind_cutlass_types(py::module &m) {
// s8
py::class_<cutlass::int8>(m, "int8")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::int8::storage)
.def("value", &cutlass::int8::c_value);
// s32
py::class_<cutlass::int32>(m, "int32")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::int32::storage)
.def("value", &cutlass::int32::c_value);
// f16
py::class_<cutlass::half_t>(m, "float16")
.def(py::init<float>())
.def(py::init<double>())
.def(py::init<int>())
.def(py::init<unsigned>())
.def_readwrite("storage", &cutlass::half_t::storage)
.def("value", [](const cutlass::half_t& value) {return value;});
// bf16
py::class_<cutlass::bfloat16_t>(m, "bfloat16")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::bfloat16_t::storage)
.def("value", [](const cutlass::bfloat16_t& value) {return value;});
// f32
py::class_<cutlass::float32>(m, "float32")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::float32::storage)
.def("value", &cutlass::float32::c_value);
// tf32
py::class_<cutlass::tfloat32_t>(m, "tfloat32")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::tfloat32_t::storage)
.def("value", [](const cutlass::tfloat32_t& value) {return value;});
// f64
py::class_<cutlass::float64>(m, "float64")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::float64::storage)
.def("value", &cutlass::float64::c_value);
}
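A small sketch of the numeric-type wrappers from Python (module name assumed):

x = cutlass.float16(1.5)
bits = x.storage                       # raw 16-bit representation of the half
assert cutlass.int32(7).value() == 7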

View File

@ -1,32 +0,0 @@
#include <cutlass/complex.h>
namespace cutlass {
/// ENUM class for datatypes
enum class DataType {
kB1, kU2, kU4, kU8,
kU16, kU32, kU64, kS2,
kS4, kS8, kS16, kS32,
kS64, kF16, kBF16, kF32,
kTF32, kF64, kCF16, kCBF16,
kCF32, kCTF32, kCF64, kCS2,
kCS4, kCS8, kCS16, kCS32,
kCS64, kCU2, kCU4, kCU8,
kCU16, kCU32, kCU64, kInvalid
};
/// ENUM class for LayoutTypes
enum class LayoutType {
kColumnMajor, kRowMajor,
kColumnMajorInterleaved2, kRowMajorInterleaved2,
kColumnMajorInterleaved32, kRowMajorInterleaved32,
kColumnMajorInterleaved64, kRowMajorInterleaved64,
kTensorNHWC, kTensorNDHWC, kTensorNCHW, kTensorNGHWC,
kTensorNC32HW32, kTensorNC64HW64, kTensorC32RSK32,
kTensorC64RSK64
};
} // namespace cutlass

View File

@ -1,54 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution problems to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/conv2d_problems.h"
#include "cutlass/conv/conv2d_problem_size.h"
namespace py = pybind11;
PYBIND11_MAKE_OPAQUE(std::vector<cutlass::conv::Conv2dProblemSize>);
void bind_conv_problem_size_test(py::module &m) {
py::bind_vector<std::vector<cutlass::conv::Conv2dProblemSize>>(m, "Conv2dProblemVector")
.def("size", &std::vector<cutlass::conv::Conv2dProblemSize>::size);
// Get Conv2d problem sizes
py::class_<test::conv::device::TestbedConv2dProblemSizes>(m, "TestbedConv2dProblemSizes")
.def(py::init<int>())
.def_readonly("conv2d_default_sizes", &test::conv::device::TestbedConv2dProblemSizes::conv2d_default_sizes);
}
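Hypothetical Python usage of the problem-size testbed; the module path and the meaning of the constructor argument (a minimum channel count) are assumptions:

sizes = cutlass.test.conv.TestbedConv2dProblemSizes(64)
n = sizes.conv2d_default_sizes.size()  # number of default Conv2d problems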

View File

@ -1,49 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution related types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "conv_problems.h"
#include "host.h"
namespace py = pybind11;
void bind_convolution_test(py::module &m) {
// Conv problem sizes
bind_conv_problem_size_test(m);
py::module_ host_submodule = m.def_submodule("host");
bind_conv_host_references(host_submodule);
}

View File

@ -1,180 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Convolution host test helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/cache_testbed_output.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_compare.h"
namespace py = pybind11;
template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host(py::module &m) {
m.def("conv2d", \
&cutlass::reference::host::Conv2d< \
Ta, La, Tb, Lb, Tc, Lc, Te, Tacc>);
m.def("CreateCachedConv2dTestKey", &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}
template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host_sat(py::module &m) {
m.def("conv2d", \
&cutlass::reference::host::Conv2d< \
Ta, La, Tb, Lb, Tc, Lc, Te, Tacc, cutlass::NumericConverterClamp<Tc, Te>>);
m.def("CreateCachedConv2dTestKey", &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}
template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nhwc(py::module &m) {
bind_conv2d_host<
Ta, cutlass::layout::TensorNHWC,
Tb, cutlass::layout::TensorNHWC,
Tc, cutlass::layout::TensorNHWC,
Tacc, Te>(m);
}
template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nc32hw32(py::module &m) {
bind_conv2d_host_sat<
Ta, cutlass::layout::TensorNCxHWx<32>,
Tb, cutlass::layout::TensorCxRSKx<32>,
Tc, cutlass::layout::TensorNCxHWx<32>,
Tacc, Te>(m);
}
template<typename T, typename Layout>
void bind_tensor_equals(py::module &m) {
m.def("equals", py::overload_cast<
const cutlass::TensorView<T, Layout>&, const cutlass::TensorView<T, Layout>&>(
&cutlass::reference::host::TensorEquals<T, Layout>
));
}
#define BIND_TENSOR_HASH(Element, Layout) { \
m.def("TensorHash", &test::conv::device::TensorHash<Element, Layout>, py::arg("view"), py::arg("hash") = test::conv::device::CRC32(), py::arg("crc")=uint32_t()); \
}
void bind_conv_host_references(py::module &m) {
//
// Conv2d reference on host
// tools/util/include/cutlass/util/reference/host/convolution.h
/// double
bind_conv2d_host_nhwc<double, double, double, double, double>(m);
/// float
bind_conv2d_host_nhwc<float, float, float, float, float>(m);
/// half
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, float>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, float>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, float>(m);
/// bfloat16
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, cutlass::bfloat16_t>(m);
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, cutlass::bfloat16_t>(m);
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);
/// s8
bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, float>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, float>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, float>(m);
//
// Compare whether two tensors are equal
//
/// double
bind_tensor_equals<double, cutlass::layout::TensorNHWC>(m);
/// float
bind_tensor_equals<float, cutlass::layout::TensorNHWC>(m);
/// half
bind_tensor_equals<cutlass::half_t, cutlass::layout::TensorNHWC>(m);
/// bfloat16
bind_tensor_equals<cutlass::bfloat16_t, cutlass::layout::TensorNHWC>(m);
/// s32
bind_tensor_equals<int32_t, cutlass::layout::TensorNHWC>(m);
bind_tensor_equals<int32_t, cutlass::layout::TensorNCxHWx<32>>(m);
/// s8
bind_tensor_equals<int8_t, cutlass::layout::TensorNHWC>(m);
bind_tensor_equals<int8_t, cutlass::layout::TensorNCxHWx<32>>(m);
/// Cache
py::class_<test::conv::device::CachedTestKey>(m, "CachedTestKey")
.def(py::init<>())
.def(py::init<std::string, std::string, std::string, uint32_t, uint32_t, uint32_t>());
py::class_<test::conv::device::CachedTestResult>(m, "CachedTestResult")
.def(py::init<>())
.def(py::init<uint32_t>())
.def_readwrite("D", &test::conv::device::CachedTestResult::D);
py::class_<test::conv::device::CachedTestResultListing>(m, "CachedTestResultListing")
.def(py::init<const std::string &>())
.def("find", &test::conv::device::CachedTestResultListing::find)
.def("append", &test::conv::device::CachedTestResultListing::append)
.def("write", &test::conv::device::CachedTestResultListing::write);
py::class_<test::conv::device::CRC32>(m, "CRC32")
.def(py::init<>());
BIND_TENSOR_HASH(double, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(float, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(cutlass::half_t, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(cutlass::bfloat16_t, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(int32_t, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(int8_t, cutlass::layout::TensorNCxHWx<32>);
}
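A sketch of the result-cache workflow these bindings expose; `find` is assumed to surface the C++ std::pair return value as a Python (hit, result) tuple:

listing = cutlass.CachedTestResultListing("conv2d_cached_results.txt")
key = cutlass.CachedTestKey()          # default-constructed key, for illustration
hit, cached = listing.find(key)
if hit:
    reference_crc = cached.D           # cached hash of the output tensor D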

View File

@ -1,45 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm test to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "host.h"
namespace py = pybind11;
void bind_gemm_test(py::module &m) {
py::module_ host_submodule = m.def_submodule("host");
bind_gemm_host_reference(host_submodule);
}

View File

@ -1,431 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm test host functions to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/cutlass.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/functional.h"
namespace py = pybind11;
template<
typename ElementA, typename LayoutA,
typename ElementB, typename LayoutB,
typename ElementC, typename LayoutC,
typename AccumulatorType, typename ComputeType,
typename InnerProductOp>
void bind_host_gemm_saturate(py::module &m) {
m.def("gemm_saturate", py::overload_cast<
cutlass::gemm::GemmCoord, ComputeType,
cutlass::TensorRef<ElementA, LayoutA>,
cutlass::TensorRef<ElementB, LayoutB>,
ComputeType,
cutlass::TensorRef<ElementC, LayoutC>,
cutlass::TensorRef<ElementC, LayoutC>,
AccumulatorType>(
&cutlass::reference::host::compute_gemm<
ElementA, LayoutA,
ElementB, LayoutB,
ElementC, LayoutC,
ComputeType,
AccumulatorType,
InnerProductOp,
cutlass::NumericConverterClamp<ElementC, AccumulatorType>>
));
}
template<
typename ElementA, typename LayoutA,
typename ElementB, typename LayoutB,
typename ElementC, typename LayoutC,
typename AccumulatorType, typename ComputeType,
typename InnerProductOp>
void bind_host_gemm(py::module &m) {
m.def("gemm", py::overload_cast<
cutlass::gemm::GemmCoord, ComputeType,
cutlass::TensorRef<ElementA, LayoutA>,
cutlass::TensorRef<ElementB, LayoutB>,
ComputeType,
cutlass::TensorRef<ElementC, LayoutC>,
cutlass::TensorRef<ElementC, LayoutC>,
AccumulatorType>(
&cutlass::reference::host::compute_gemm<
ElementA, LayoutA,
ElementB, LayoutB,
ElementC, LayoutC,
ComputeType,
AccumulatorType,
InnerProductOp,
cutlass::NumericConverter<ElementC, AccumulatorType>>
));
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add(py::module &m) {
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate(py::module &m) {
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_interleaved(py::module &m) {
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate_interleaved(py::module &m) {
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
#define BIND_TENSOR_EQUAL(Element, Layout) { \
m.def("equals", py::overload_cast< \
const cutlass::TensorView<Element, Layout>&, const cutlass::TensorView<Element, Layout>&>( \
&cutlass::reference::host::TensorEquals<Element, Layout>)); \
}
void bind_gemm_host_reference(py::module &m) {
/// double
bind_host_gemm_multiply_add<double, double, double, double, double>(m);
/// float
bind_host_gemm_multiply_add<float, float, float, float, float>(m);
/// half_t
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, float, float>(m);
/// bfloat16
bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);
/// s8
bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);
// float
BIND_TENSOR_EQUAL(float, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(float, cutlass::layout::ColumnMajor);
// double
BIND_TENSOR_EQUAL(double, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(double, cutlass::layout::ColumnMajor);
// half_t
BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::ColumnMajor);
// bfloat16
BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::ColumnMajor);
// int32_t
BIND_TENSOR_EQUAL(int32_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(int32_t, cutlass::layout::ColumnMajor);
// int8_t
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajor);
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajorInterleaved<32>);
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajorInterleaved<32>);
}

View File

@ -1,55 +0,0 @@
import re
def SubstituteTemplate(template, values):
    """Repeatedly substitutes ${key} placeholders in `template` with `values[key]` until a fixed point is reached."""
    text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
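A minimal usage sketch of SubstituteTemplate, assuming only the function above is in scope (the type names in the strings are made up):

    nested = SubstituteTemplate(
        "using Element = ${outer};",
        {"outer": "cute::Int<${inner}>", "inner": "8"})
    # Substitution runs to a fixed point, so the ${inner} placeholder introduced
    # by the first pass is resolved on the next pass.
    assert nested == "using Element = cute::Int<8>;"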
from pycutlass.type_hint import *
from pycutlass.tensor_ref import *
from pycutlass.operation import *
from pycutlass.epilogue import *
from pycutlass.parser import *
from pycutlass.compiler import ArtifactManager
from pycutlass.memory_manager import *
from pycutlass.arguments import *
from pycutlass.library import *
from pycutlass.c_types import *
from pycutlass.gemm_operation import *
from pycutlass.conv2d_operation import *
from pycutlass.compiler import *
from pycutlass.utils import *
from pycutlass.frontend import *
from pycutlass.reduction_operation import *
from pycutlass.utils.device import device_cc
# module-wide variables
import sys
this = sys.modules[__name__]
# artifact manager
this.compiler = ArtifactManager()
try:
    if not hasattr(this, 'DEVICE_CC') or this.DEVICE_CC is None:
        this.DEVICE_CC = device_cc()
except Exception:
    # No CUDA device is visible at import time; fall back to the operations' arch tags.
    this.DEVICE_CC = None
def get_memory_pool(init_pool_size=0, max_pool_size=2**34):
this.memory_pool = PoolMemoryManager(
init_pool_size=init_pool_size,
max_pool_size=max_pool_size
)
return this.memory_pool
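A hedged usage sketch (assumes the pycutlass package and a CUDA device are available; the pool sizes are illustrative):

    import pycutlass
    # Reserve a 1 GiB device pool up front, allowing growth to 4 GiB.
    pool = pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)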

View File

@ -1,118 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from .frontend import CupyFrontend
from typeguard import typechecked
from pycutlass.frontend import *
from typing import Union
import numpy as np
from cuda import cuda
try:
import torch
torch_available = True
except ImportError:
torch_available = False
from cuda import cudart
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
# @typechecked
class ArgumentBase:
"""
Base class for operation arguments
"""
def __init__(self,
A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
**kwargs) -> None:
# tensor_C can be interpreted as the bias with bias=True in keyword args
if "bias" in kwargs.keys():
self.bias = kwargs["bias"]
else:
# by default, tensor_C is not bias
self.bias = False
# preprocessing input tensors
if isinstance(A, np.ndarray):
self.host_D = D
self.buffer_A = NumpyFrontend.argument(A, False)
self.buffer_B = NumpyFrontend.argument(B, False)
self.buffer_C = NumpyFrontend.argument(C, False)
self.buffer_D = NumpyFrontend.argument(D, True)
self.ptr_A = self.buffer_A.ptr
self.ptr_B = self.buffer_B.ptr
self.ptr_C = self.buffer_C.ptr
self.ptr_D = self.buffer_D.ptr
# number of elements in C
self.tensor_c_numel = C.size
elif torch_available and isinstance(A, torch.Tensor):
self.ptr_A = TorchFrontend.argument(A)
self.ptr_B = TorchFrontend.argument(B)
self.ptr_C = TorchFrontend.argument(C)
self.ptr_D = TorchFrontend.argument(D)
# number of elements in C
self.tensor_c_numel = C.numel()
elif isinstance(A, cuda.CUdeviceptr):
self.ptr_A = A
self.ptr_B = B
self.ptr_C = C
self.ptr_D = D
elif cupy_available and isinstance(A, cp.ndarray):
self.ptr_A = CupyFrontend.argument(A)
self.ptr_B = CupyFrontend.argument(B)
self.ptr_C = CupyFrontend.argument(C)
self.ptr_D = CupyFrontend.argument(D)
# number of elements in C
self.tensor_c_numel = C.size
else:
        raise TypeError(
            "Unsupported frontend. Only NumPy arrays, Torch tensors, CuPy arrays, and raw CUdeviceptr values are supported")
def sync(self, stream_sync=True):
if stream_sync:
err, = cudart.cudaDeviceSynchronize()
            if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("CUDA Error %s" % str(err))
if hasattr(self, "host_D"):
err, = cuda.cuMemcpyDtoH(
self.host_D, self.ptr_D, self.host_D.size * self.host_D.itemsize)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
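For orientation, a hedged sketch of the NumPy path through ArgumentBase (assumes pycutlass is importable under this package layout and a CUDA device is present; shapes are illustrative):

    import numpy as np
    from pycutlass.arguments import ArgumentBase

    A = np.ones((16, 16), dtype=np.float32)
    B = np.ones((16, 16), dtype=np.float32)
    C = np.zeros((16, 16), dtype=np.float32)
    D = np.zeros((16, 16), dtype=np.float32)
    args = ArgumentBase(A, B, C, D)  # host arrays are copied into device buffers
    args.sync()                      # synchronize, then copy D back into the host array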

View File

@ -1,395 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utilities for stamping out collective mainloops for SM90 kernels
"""
import cute
import cutlass
from pycutlass import SubstituteTemplate
import pycutlass.library as library
tma_alignment_bytes = 16
cp_async_min_alignment_bytes = 4
class RowColMajorToGMMAMajor:
@staticmethod
def A(layout, element):
"""
Converts operand A's layout from row/column major format into CuTe's GMMA major format
:param layout: layout of the A operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:param element: data type of the A operand
:return: C++ CuTe GMMA major format
:rtype: cute.GMMAMajor
"""
type_requires_k_major = (element == cutlass.tfloat32) or (element == cutlass.int8)
if layout == cutlass.ColumnMajor and not type_requires_k_major:
return cute.GMMAMajor.MN
else:
return cute.GMMAMajor.K
@staticmethod
def B(layout, element):
"""
Converts operand B's layout from row/column major format into CuTe's GMMA major format
:param layout: layout of the B operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:param element: data type of the B operand
:return: C++ CuTe GMMA major format
:rtype: cute.GMMAMajor
"""
type_requires_k_major = (element == cutlass.tfloat32) or (element == cutlass.int8)
if layout == cutlass.RowMajor and not type_requires_k_major:
return cute.GMMAMajor.MN
else:
return cute.GMMAMajor.K
def cluster_shape_to_tma(dim):
"""
Returns the TMA copy type for a given cluster dimension
:param dim: a given dimension of a cluster
:type dim: int
:return: C++ TMA copy type
:rtype: str
"""
return 'cute::SM90_TMA_LOAD' if dim == 1 else 'cute::SM90_TMA_LOAD_MULTICAST'
def make_cpasync_gmem_tiled_copy(thread_count, element, alignment, gmma_layout, dim_mn, dim_k):
"""
Returns a `make_tiled_copy` call for a given configuration
:param thread_count: number of threads in the threadblock
:type thread_count: int
:param element: datatype of the operand in question
:param alignment: byte alignment of the operand in question
:type alignment: int
:param gmma_layout: GMMA layout of the operand in question
:type gmma_layout: cute.GMMAMajor
:param dim_mn: extent of the M/N dimension of the tile
:type dim_mn: int
:param dim_k: extent of the reduction dimension of the tile
:type dim_k: int
:return: C++ call to `make_tiled_copy`
:rtype: str
"""
emission_str = """decltype(cute::make_tiled_copy(
cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<cute::uint_byte_t<static_cast<int>(sizeof(${element})) * ${alignment}>>, ${element}>{},
cute::Layout<cute::Shape<_${shape0_x}, _${shape0_y}>,
cute::Stride<_${stride_x}, _${stride_y}>>{},
cute::Layout<cute::Shape<_${shape1_x}, _${shape1_y}>>{}))"""
if gmma_layout == cute.GMMAMajor.K:
threads_major = dim_k // alignment
threads_minor = thread_count // threads_major
values = {
'shape0_x': str(threads_minor),
'shape0_y': str(threads_major),
'stride_x': str(threads_major),
'stride_y': '1',
'shape1_x': '1',
'shape1_y': str(alignment)
}
elif gmma_layout == cute.GMMAMajor.MN:
threads_major = dim_mn // alignment
threads_minor = thread_count // threads_major
values = {
'shape0_x': str(threads_major),
'shape0_y': str(threads_minor),
'stride_x': '1',
'stride_y': str(threads_major),
'shape1_x': str(alignment),
'shape1_y': '1'
}
else:
raise Exception('Unexpected GMMA layout {}'.format(gmma_layout))
# Add common values
values['element'] = library.DataTypeTag[element]
values['alignment'] = str(alignment)
return SubstituteTemplate(emission_str, values)
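The thread-partition arithmetic above is easiest to see with concrete numbers; a standalone sketch (values are illustrative, not taken from any particular kernel):

    # K-major partition: threads advance fastest along the contiguous K dimension.
    thread_count, alignment, dim_k = 128, 4, 64
    threads_major = dim_k // alignment             # 16 threads span K, each loading `alignment` elements
    threads_minor = thread_count // threads_major  # 8 threads span the M/N extent
    assert threads_major * threads_minor == thread_count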
def max_stages(op, arch):
"""
Returns the maximum number of pipeline stages that can be used for an operation.
:param op: operation for which the maximum stages should be computed. If stages are
set via the `op.tile_description.stages` parameter, this setting is ignored
in the present calculation
:type op: pycutlass.GemmOperation
:param arch: compute capability of the device on which the operation will be run
:type arch: int
:return: maximum number of pipeline stages that can be used for an operation
:rtype: int
"""
smem_per_stage = library.CalculateSmemUsagePerStage(op)
smem_capacity = library.SharedMemPerCC[arch]
return int(smem_capacity // smem_per_stage)
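Concretely, the stage count is plain integer division of the shared-memory budget by per-stage usage; a sketch with assumed numbers:

    smem_capacity = 228 * 1024    # assumed SM90 shared-memory budget in bytes
    smem_per_stage = 24 * 1024    # hypothetical bytes consumed by one pipeline stage
    print(smem_capacity // smem_per_stage)  # -> 9 stages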
class LayoutToStride:
_variable_first = 'cute::Stride<int64_t, cute::Int<1>, int64_t>'
_variable_last = 'cute::Stride<cute::Int<1>, int64_t, int64_t>'
@staticmethod
def A(layout):
"""
Returns the CuTe stride type corresponding to the layout of operand A
:param layout: layout of the A operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:return: C++ declaration of CuTe stride
:rtype: str
"""
if layout == cutlass.RowMajor:
return LayoutToStride._variable_first
elif layout == cutlass.ColumnMajor:
return LayoutToStride._variable_last
else:
raise Exception('Unsupported layout {}'.format(layout))
@staticmethod
def B(layout):
"""
Returns the CuTe stride type corresponding to the layout of operand B
:param layout: layout of the B operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:return: C++ declaration of CuTe stride
:rtype: str
"""
if layout == cutlass.RowMajor:
return LayoutToStride._variable_last
elif layout == cutlass.ColumnMajor:
return LayoutToStride._variable_first
else:
raise Exception('Unsupported layout {}'.format(layout))
EMISSION_STR = """
using TileShape_MNK = cute::Shape<_${threadblock_shape_m}, _${threadblock_shape_n}, _${threadblock_shape_k}>;
using ClusterShape_MNK = cute::Shape<_${cluster_shape_m}, _${cluster_shape_n}, _${cluster_shape_k}>;
using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
${internal_element_A}, ${internal_element_B}, ${element_accumulator}, TileShape_MNK, ${gmma_layout_A}, ${gmma_layout_B}>()));
using SmemLayoutAtomA = decltype(cute::GMMA::smem_selector<${gmma_layout_A}, ${internal_element_A}, _${threadblock_shape_m}, _${threadblock_shape_k}>());
using SmemLayoutAtomB = decltype(cute::GMMA::smem_selector<${gmma_layout_B}, ${internal_element_B}, _${threadblock_shape_n}, _${threadblock_shape_k}>());
using CollectiveOp = typename cutlass::gemm::collective::CollectiveMma<
${mainloop_type}<${stage_count}, ClusterShape_MNK${kernel_schedule}>,
TileShape_MNK,
${element_A},
${stride_A},
${element_B},
${stride_B},
TiledMma,
${gmem_tiled_copy_A},
SmemLayoutAtomA,
void, // GMMA_SS does not need an SmemCopyAtom
${transform_A},
${gmem_tiled_copy_B},
SmemLayoutAtomB,
void, // GMMA_SS does not need an SmemCopyAtom
${transform_B}
>;
"""
def internal_element(element):
"""
Returns the data type internally used for `element`.
:param element: data type
:return: data type used internally
"""
return cutlass.tfloat32 if element == cutlass.float32 else element
def common_values(op, stage_count, transform_A, transform_B):
"""
Returns a dictionary containing common values to be substituted in the emission of the
collective operation declaration. Values specific to a particular collective operation
should be added to these.
:param op: GEMM operation for which to build a collective operation
:type op: pycutlass.GemmOperation
:param stage_count: number of pipeline stages to use in the operation
:type stage_count: int
:param transform_A: transformation to perform on the A operand
:type transform_A: str
:param transform_B: transformation to perform on the B operand
:type transform_B: str
:return: dictionary containing values to substitute in emission string
:rtype: dict
"""
internal_element_a = internal_element(op.A.element)
internal_element_b = internal_element(op.B.element)
return {
'threadblock_shape_m': str(op.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(op.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(op.tile_description.threadblock_shape[2]),
'cluster_shape_m': str(op.tile_description.cluster_shape[0]),
'cluster_shape_n': str(op.tile_description.cluster_shape[1]),
'cluster_shape_k': str(op.tile_description.cluster_shape[2]),
'element_A': library.DataTypeTag[op.A.element],
'element_B': library.DataTypeTag[op.B.element],
'internal_element_A': library.DataTypeTag[internal_element_a],
'internal_element_B': library.DataTypeTag[internal_element_b],
'element_accumulator': library.DataTypeTag[op.accumulator_type()],
'gmma_layout_A': library.CuTeLayoutTag[RowColMajorToGMMAMajor.A(op.A.layout, internal_element_a)],
'gmma_layout_B': library.CuTeLayoutTag[RowColMajorToGMMAMajor.B(op.B.layout, internal_element_b)],
'stride_A': LayoutToStride.A(op.A.layout),
'stride_B': LayoutToStride.B(op.B.layout),
'stage_count': str(stage_count),
'transform_A': transform_A,
'transform_B': transform_B
}
def build_gmma_tma(op):
"""
Builds a collective operation declaration targeting TMA GMMA kernels
:param op: GEMM operation for which to build a collective operation
:type op: pycutlass.GemmOperation
:return: string containing the C++ declaration of collective operation
:rtype: str
"""
A_tma_aligned = (library.DataTypeSizeBytes[op.A.element] * op.A.alignment) % tma_alignment_bytes == 0
B_tma_aligned = (library.DataTypeSizeBytes[op.B.element] * op.B.alignment) % tma_alignment_bytes == 0
if not A_tma_aligned or not B_tma_aligned:
        raise Exception('Each of the A and B operands must be aligned to {} bytes to use TMA'.format(tma_alignment_bytes))
max_stage_count = max_stages(op, arch=90)
if op.tile_description.stages is None:
op.tile_description.stages = max_stage_count
elif op.tile_description.stages > max_stage_count:
raise Exception('Combination of threadblock shape, data types, and number of stages exceeds shared memory capacity.')
kernel_schedule = 'cutlass::gemm::KernelTmaWarpSpecialized'
if op.tile_description.persistent:
kernel_schedule = 'cutlass::gemm::KernelTmaWarpSpecializedPersistent'
transform_A = 'cute::identity'
transform_B = 'cute::identity'
values = common_values(op, op.tile_description.stages, transform_A, transform_B)
specific_values = {
'mainloop_type': 'cutlass::gemm::MainloopSm90TmaGmmaWarpSpecialized',
'kernel_schedule': ', ' + kernel_schedule,
'gmem_tiled_copy_A': cluster_shape_to_tma(op.tile_description.cluster_shape[1]),
'gmem_tiled_copy_B': cluster_shape_to_tma(op.tile_description.cluster_shape[0])
}
values.update(specific_values)
return SubstituteTemplate(EMISSION_STR, values)
def build_gmma_cpasync(op):
"""
Builds a collective operation declaration targeting cp.async GMMA kernels
:param op: GEMM operation for which to build a collective operation
:type op: pycutlass.GemmOperation
:return: string containing the C++ declaration of collective operation
:rtype: str
"""
A_cp_async_aligned = (library.DataTypeSizeBytes[op.A.element] * op.A.alignment) % cp_async_min_alignment_bytes == 0
B_cp_async_aligned = (library.DataTypeSizeBytes[op.B.element] * op.B.alignment) % cp_async_min_alignment_bytes == 0
if not A_cp_async_aligned or not B_cp_async_aligned:
        raise Exception('Each of the A and B operands must be aligned to {} bytes to use cp.async'.format(cp_async_min_alignment_bytes))
max_stage_count = max_stages(op, arch=90)
if op.tile_description.stages is None:
op.tile_description.stages = max_stage_count
elif op.tile_description.stages > max_stage_count:
raise Exception('Combination of threadblock shape, data types, and number of stages exceeds shared memory capacity.')
transform_A = 'cute::identity'
transform_B = 'cute::identity'
thread_count = 128
cpasync_copy_A = make_cpasync_gmem_tiled_copy(thread_count, op.A.element, op.A.alignment, RowColMajorToGMMAMajor.A(op.A.layout, op.A.element),
op.tile_description.threadblock_shape[0], op.tile_description.threadblock_shape[2])
cpasync_copy_B = make_cpasync_gmem_tiled_copy(thread_count, op.B.element, op.B.alignment, RowColMajorToGMMAMajor.B(op.B.layout, op.B.element),
op.tile_description.threadblock_shape[1], op.tile_description.threadblock_shape[2])
values = common_values(op, op.tile_description.stages, transform_A, transform_B)
specific_values = {
'mainloop_type': 'cutlass::gemm::MainloopSm90CpAsyncGmma',
'kernel_schedule': '',
'gmem_tiled_copy_A': cpasync_copy_A,
'gmem_tiled_copy_B': cpasync_copy_B
}
values.update(specific_values)
return SubstituteTemplate(EMISSION_STR, values)
def build(operation):
"""
Builds a collective operation declaration targeting cp.async or TMA for GMMA kernels
:param operation: GEMM operation for which to build a collective operation
:type operation: pycutlass.GemmOperation
:return: string containing the C++ declaration of collective operation
:rtype: str
"""
A_tma_aligned = (library.DataTypeSizeBytes[operation.A.element] * operation.A.alignment) % tma_alignment_bytes == 0
B_tma_aligned = (library.DataTypeSizeBytes[operation.B.element] * operation.B.alignment) % tma_alignment_bytes == 0
tma_correct_size = (library.DataTypeSizeBytes[operation.A.element] == 2 and library.DataTypeSizeBytes[operation.B.element] == 2)
tma_correct_layout = (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor)
if A_tma_aligned and B_tma_aligned and (tma_correct_size or tma_correct_layout):
return build_gmma_tma(operation)
else:
return build_gmma_cpasync(operation)
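Restated with stub values, the dispatch predicate above reduces to a byte-alignment check (a sketch, not library API):

    # TMA requires 16-byte-aligned operands; fp16 with 8-element alignment qualifies.
    element_size, alignment, tma_alignment_bytes = 2, 8, 16
    a_ok = b_ok = (element_size * alignment) % tma_alignment_bytes == 0
    print("TMA" if a_ok and b_ok else "cp.async")  # -> TMA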

View File

@ -1,279 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import ctypes
from pycutlass.library import *
class GemmCoord_(ctypes.Structure):
_fields_ = [
("m", ctypes.c_int),
("n", ctypes.c_int),
("k", ctypes.c_int)
]
def __init__(self, gemm_coord) -> None:
for field_name, _ in self._fields_:
setattr(self, field_name, getattr(gemm_coord, field_name)())
class GemmCoordBatched_(ctypes.Structure):
"""
Wrapper around a GemmCoord that also contains batch count. This is used for encoding
batched GEMM inputs to CUTLASS 3 GEMMs.
"""
_fields_ = [
("m", ctypes.c_int),
("n", ctypes.c_int),
("k", ctypes.c_int),
("batch_count", ctypes.c_int)
]
def __init__(self, gemm_coord, batch_count) -> None:
for field_name, _ in self._fields_[:-1]:
setattr(self, field_name, getattr(gemm_coord, field_name)())
setattr(self, "batch_count", batch_count)
class MatrixCoord_(ctypes.Structure):
_fields_ = [
("row", ctypes.c_int),
("column", ctypes.c_int)
]
class dim3_(ctypes.Structure):
_fields_ = [
("x", ctypes.c_int),
("y", ctypes.c_int),
("z", ctypes.c_int)
]
class StrideBatched_(ctypes.Structure):
"""
CUTLASS 3.0 strides for operands contain one static dimension and two variable dimensions. The
variable dimensions represent the stride along the non-unit-stride dimension of the row/column-major
layout and the batch stride. This structure encodes the two variable dimensions.
"""
_fields_ = [
("major_stride", ctypes.c_int64),
("batch_stride", ctypes.c_int64)
]
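As a standalone check of how these ctypes structures mirror their C counterparts (a local copy for illustration; the field values are made up):

    import ctypes

    class StrideBatched(ctypes.Structure):
        _fields_ = [("major_stride", ctypes.c_int64),
                    ("batch_stride", ctypes.c_int64)]

    s = StrideBatched(major_stride=1024, batch_stride=1024 * 512)
    assert ctypes.sizeof(s) == 16  # two packed int64 fields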
# float16 has no native ctypes equivalent, so its bits are carried in a c_uint16.
dtype2ctype = {
cutlass.float16: ctypes.c_uint16,
cutlass.float32: ctypes.c_float,
cutlass.float64: ctypes.c_double,
cutlass.int32: ctypes.c_int32
}
def get_gemm_arguments_3x(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _GemmArguments(ctypes.Structure):
_fields_ = [
("mode", ctypes.c_int),
("problem_size", GemmCoordBatched_),
("ptr_A", ctypes.c_void_p),
("stride_A", StrideBatched_),
("ptr_B", ctypes.c_void_p),
("stride_B", StrideBatched_),
("ptr_C", ctypes.c_void_p),
("stride_C", StrideBatched_),
("ptr_D", ctypes.c_void_p),
("stride_D", StrideBatched_),
("epilogue", _EpilogueOutputOpParams),
]
return _GemmArguments, _EpilogueOutputOpParams
def get_gemm_arguments(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _GemmArguments(ctypes.Structure):
_fields_ = [
# Arguments from UniversalArgumentsBase
("mode", ctypes.c_int),
("problem_size", GemmCoord_),
("batch_count", ctypes.c_int),
("batch_stride_D", ctypes.c_longlong),
# Remaining arguments
("epilogue", _EpilogueOutputOpParams),
("ptr_A", ctypes.c_void_p),
("ptr_B", ctypes.c_void_p),
("ptr_C", ctypes.c_void_p),
("ptr_D", ctypes.c_void_p),
("batch_stride_A", ctypes.c_longlong),
("batch_stride_B", ctypes.c_longlong),
("batch_stride_C", ctypes.c_longlong),
("stride_a", ctypes.c_longlong),
("stride_b", ctypes.c_longlong),
("stride_c", ctypes.c_longlong),
("stride_d", ctypes.c_longlong),
("lda", ctypes.c_longlong),
("ldb", ctypes.c_longlong),
("ldc", ctypes.c_longlong),
("ldd", ctypes.c_longlong),
("ptr_gather_A_indices", ctypes.c_void_p),
("ptr_gather_B_indices", ctypes.c_void_p),
("ptr_scatter_D_indices", ctypes.c_void_p)
]
return _GemmArguments, _EpilogueOutputOpParams
###########################################################################################
# GEMM Grouped
###########################################################################################
def get_gemm_grouped_arguments(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _GEMMGroupedArguments(ctypes.Structure):
_fields_ = [
("problem_sizes", ctypes.c_void_p),
("problem_count", ctypes.c_int),
("threadblock_count", ctypes.c_int),
("output_op", _EpilogueOutputOpParams),
("ptr_A", ctypes.c_void_p),
("ptr_B", ctypes.c_void_p),
("ptr_C", ctypes.c_void_p),
("ptr_D", ctypes.c_void_p),
("lda", ctypes.c_void_p),
("ldb", ctypes.c_void_p),
("ldc", ctypes.c_void_p),
("ldd", ctypes.c_void_p),
("host_problem_sizes", ctypes.c_void_p)
]
return _GEMMGroupedArguments, _EpilogueOutputOpParams
############################################################################################
# Convolution2D
############################################################################################
class Conv2DProblemSize(ctypes.Structure):
_fields_ = [
("N", ctypes.c_int),
("H", ctypes.c_int),
("W", ctypes.c_int),
("C", ctypes.c_int),
("P", ctypes.c_int),
("Q", ctypes.c_int),
("K", ctypes.c_int),
("R", ctypes.c_int),
("S", ctypes.c_int),
("pad_h", ctypes.c_int),
("pad_w", ctypes.c_int),
("stride_h", ctypes.c_int),
("stride_w", ctypes.c_int),
("dilation_h", ctypes.c_int),
("dilation_w", ctypes.c_int),
("mode", ctypes.c_int), # kCrossCorrelation: 0, kConvolution: 1
("split_k_slices", ctypes.c_int),
("groups", ctypes.c_int)
]
def __init__(self, problem_size) -> None:
for field_name, _ in self._fields_:
setattr(self, field_name, getattr(problem_size, field_name))
class Layout4D(ctypes.Structure):
_fields_ = [
("stride", ctypes.c_int * 3)
]
def __init__(self, tensor_ref):
stride = tensor_ref.stride()
setattr(self, "stride", (stride.at(0), stride.at(1), stride.at(2)))
class TensorRef_(ctypes.Structure):
_fields_ = [
("ptr", ctypes.c_void_p),
("layout", Layout4D)
]
def __init__(self, tensor_ref):
setattr(self, "ptr", tensor_ref.data())
setattr(self, "layout", Layout4D(tensor_ref.layout()))
class TensorRef2D_(ctypes.Structure):
_fields_ = [
("ptr", ctypes.c_void_p),
("stride", ctypes.c_int)
]
def get_conv2d_arguments(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _Conv2dArguments(ctypes.Structure):
_fields_ = [
("problem_size", Conv2DProblemSize), # 0
("ref_A", TensorRef_), # 72
("ref_B", TensorRef_), # 96
("ref_C", TensorRef_), # 120
("ref_D", TensorRef_), # 144
("output_op", _EpilogueOutputOpParams), # 168
("split_k_mode", ctypes.c_int) # 192
]
return _Conv2dArguments, _EpilogueOutputOpParams
############################################################################################
# Reduction
############################################################################################
def get_reduction_params(epilogue_functor):
_EpilogueOutputParams = epilogue_functor.epilogue_type
class _ReductionParams(ctypes.Structure):
_fields_ = [
("problem_size", MatrixCoord_),
("partitions", ctypes.c_int),
("partition_stride", ctypes.c_longlong),
("workspace", TensorRef2D_),
("destination", TensorRef2D_),
("source", TensorRef2D_),
("output_op", _EpilogueOutputParams)
]
return _ReductionParams, _EpilogueOutputParams

View File

@ -1,460 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
import cutlass
from cuda import cuda
from cuda import nvrtc
import tempfile
import os
import ctypes
#
import json
import sqlite3
IncludeTemplate = r'''#include "${include}"
'''
#
class CompilationOptions:
'''
Compilation options for the NVRTC and NVCC backends: compiler flags, target architecture, and include paths.
'''
#
    def __init__(self, flags, arch, include_paths=None):
        self.includes = []
        # Avoid a shared mutable default argument.
        self.include_paths = include_paths if include_paths is not None else []
self.flags = flags
self.arch = arch
def get_str(self):
options = ""
for flag in self.flags:
options += " " + flag
for incl in self.include_paths:
options += ' --include-path=%s' % incl
arch_flag = " -arch=sm_%d" % self.arch
if self.arch == 90:
arch_flag += 'a'
options += arch_flag
return options
#
def get(self):
options = []
for flag in self.flags:
options.append(bytes(str.encode(flag)))
for incl in self.include_paths:
options.append(bytes(str.encode('--include-path=%s' % incl)))
arch_flag = " -arch=sm_%d" % self.arch
if self.arch == 90:
arch_flag += 'a'
options.append(bytes(str.encode(arch_flag)))
return options
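A hedged usage sketch of the class above (assuming it is in scope; note how arch 90 picks up the 'a' suffix required for arch-conditional SM90 features):

    opts = CompilationOptions(['-std=c++17'], 90, include_paths=['/usr/local/cuda/include'])
    print(opts.get_str())
    # ->  -std=c++17 --include-path=/usr/local/cuda/include -arch=sm_90a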
def convertToBinaryData(filename):
with open(filename, 'rb') as file:
blobData = file.read()
return blobData
def CDLLBin(host_binary):
tempfile.tempdir = "./"
temp_so = tempfile.NamedTemporaryFile(
prefix='host_func', suffix='.so', delete=True)
with open(temp_so.name, 'wb') as file:
file.write(host_binary)
host_lib = ctypes.CDLL(temp_so.name)
return host_lib
class ArtifactManager:
"""
Artifact manager
"""
def __init__(self) -> None:
try:
connection = sqlite3.connect("./compiled_cache.db")
cursor = connection.cursor()
sqlite_create_table_query = """CREATE TABLE compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
cursor.execute(sqlite_create_table_query)
connection.commit()
cursor.close()
        except sqlite3.OperationalError:
            # The table already exists from a previous run; reuse it.
            pass
self.nvcc()
self.compiled_cache_device = cutlass.CompileCache()
self.compiled_cache_host = cutlass.CompileCache()
def nvrtc(self):
self.backend = "nvrtc"
self.default_compile_options = [
'-std=c++17', '-default-device'
]
def nvcc(self):
self.backend = "nvcc"
self.default_compile_options = [
'-std=c++17', '--expt-relaxed-constexpr', '-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored'
]
def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
connection = sqlite3.connect("./compiled_cache.db")
cursor = connection.cursor()
sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
hostbin = convertToBinaryData(hostfile)
data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
cursor.execute(sqlite_insert_blob_query, data_tuple)
connection.commit()
cursor.close()
def load_operation(self, op_key, extra_funcs):
connection = sqlite3.connect("./compiled_cache.db")
cursor = connection.cursor()
sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
cursor.execute(sqlite_fetch_blob_query, (op_key, ))
record = cursor.fetchall()
if len(record) == 0:
return False
for row in record:
key, cubin_image, host_binary, operation_name, op_attr = row
op_attr = json.loads(op_attr)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
err, kernel = cuda.cuModuleGetFunction(
module, bytes(str.encode(operation_name)))
self.compiled_cache_device.insert(key, kernel)
compiled_host_fns = {}
host_lib = CDLLBin(host_binary)
func_name = operation_name + '_get_params'
func = getattr(host_lib, func_name)
func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
compiled_host_fns['get_args'] = func
func_name = operation_name + '_shared_memory_size'
func = getattr(host_lib, func_name)
compiled_host_fns['shared_memory_capacity'] = func()
for attr in op_attr:
if isinstance(attr, str):
func_name = operation_name + '_' + attr
func = getattr(host_lib, func_name)
# Set the return type of the function
if attr in extra_funcs and extra_funcs[attr] != None:
func.restype = extra_funcs[attr]
compiled_host_fns[attr] = func
self.compiled_cache_host.insert(key, compiled_host_fns)
return True
def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
"""
Compile a list of kernels and store them in the database
"""
source_buffer_device = ""
source_buffer_host = ""
# 1. include
includes = []
for operation in operation_list:
for incl in operation.emitter.includes:
if incl not in includes:
includes.append(incl)
includes_host = [
"builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
for incl in includes:
source_buffer_device += SubstituteTemplate(
IncludeTemplate, {'include': incl})
for incl in includes_host:
if "/device/" not in incl:
source_buffer_host += SubstituteTemplate(
IncludeTemplate, {'include': incl})
# 2. Operations
for operation in operation_list:
source_buffer_device += operation.emit()
source_buffer_host += operation.emit()
values = {
'operation_name': operation.name(),
'operation_suffix': operation.emitter.operation_suffix
}
source_buffer_device += SubstituteTemplate(
operation.KernelTemplate, values)
source_buffer_host += SubstituteTemplate(
operation.HostTemplate, values)
if self.backend == "nvrtc":
# 3. compile
err, program = nvrtc.nvrtcCreateProgram(
str.encode(source_buffer_device),
bytes(str.encode("module.cu")),
0, [], [])
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
# Compile program
options = compilation_options.get()
err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
error_string = 'NVRTC Error: {}\n'.format(err)
# Get log from compilation
err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
log = b' ' * logSize
err, = nvrtc.nvrtcGetProgramLog(program, log)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
raise RuntimeError(
error_string + log.decode() + source_buffer_device)
# Get data from compilation
err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
cubin_image = b' ' * dataSize
err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
else: # with nvcc backend
# emit code
tempfile.tempdir = "./"
temp_cu = tempfile.NamedTemporaryFile(
prefix='kernel', suffix='.cu', delete=True)
temp_cubin = tempfile.NamedTemporaryFile(
prefix='kernel', suffix='.cubin', delete=True)
with open(temp_cu.name, 'w') as file:
file.write(source_buffer_device)
# compile with nvcc
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
values = {
"cuda_install_path": cuda_install_path,
"options": compilation_options.get_str(),
"srcfile": temp_cu.name,
"tarfile": temp_cubin.name
}
cmd = SubstituteTemplate(cmd_template, values)
os.system(cmd)
# load the cubin image
with open(temp_cubin.name, 'rb') as file:
cubin_image = file.read()
# Set up the host-side library code
if requires_nvcc_hostlib_compilation:
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
cmd_template = "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" % source_buffer_host
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": cuda_install_path,
"options": compilation_options.get_str()
})
else:
options = compilation_options.get()
cmd = "echo '%s'|g++ -x c++ -fpermissive -w -fPIC" % source_buffer_host
filtered_opts = ['-default-device', '-Xcicc', '-Xllc', '--expt-relaxed-constexpr', '-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored']
for opt in options:
opt = opt.decode("utf-8")
if opt not in filtered_opts and '-arch=sm_' not in opt:
if '--include-path=' in opt:
cmd += " " + opt.replace('--include-path=', '-I')
else:
cmd += " " + opt
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
prefix='host_func', suffix='.so', delete=True)
cmd += ' - -shared -o %s -lcudart -lcuda' % temp.name
os.system(cmd)
host_lib = ctypes.CDLL(temp.name)
return cubin_image, host_lib, temp
def add_module(self, operations, compile_options=None):
"""
Insert a new compiled device module
"""
if compile_options is None:
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
include_paths = [
cuda_install_path + '/include',
cutlass_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/tools/library/scripts/pycutlass/src/cpp/include'
]
if pycutlass.DEVICE_CC is not None:
arch = pycutlass.DEVICE_CC
else:
# Find the maximum arch tag among the provided operations and compile for that target.
# Since we are compiling to .cubin files, only one architecture may be specified.
arch = max([op.arch for op in operations])
compile_options = CompilationOptions(
self.default_compile_options, arch, include_paths)
# save the cubin
operation_key = []
operation_list = []
requires_nvcc_hostlib_compilation = False
for operation in operations:
# step 1: get kernel string as key
key = operation.rt_module.emit() + operation.procedural_name() + self.backend
            # step 2: check whether the operation is already in one of the caches
compiled_kernel = self.compiled_cache_device.at(key)
if compiled_kernel is None:
hit = self.load_operation(key, getattr(operation.rt_module, 'extra_funcs', {}))
if hit:
compiled_kernel = self.compiled_cache_device.at(key)
assert compiled_kernel is not None
if compiled_kernel is not None:
operation.rt_module.kernel = compiled_kernel
compiled_host_fns = self.compiled_cache_host.at(key)
assert compiled_host_fns is not None
                for fn_name in compiled_host_fns.keys():
                    setattr(operation.rt_module, fn_name, compiled_host_fns[fn_name])
operation.rt_module.initialize()
else:
operation_list.append(operation.rt_module)
operation_key.append(key)
# Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
# the PyCUTLASS host-side library. Otherwise, g++ will be used.
if isinstance(operation, pycutlass.gemm_operation.GemmOperationUniversal) and operation.api == pycutlass.library.ApiVersion.v3x:
if self.backend == "nvrtc":
raise RuntimeError('CUTLASS 3 kernels currently require NVCC for compilation.')
requires_nvcc_hostlib_compilation = True
if len(operation_list) > 0:
cubin_image, host_lib, host_file = self.emit_compile_(
operation_list, compile_options, requires_nvcc_hostlib_compilation)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
operation_name = []
operation_attr = []
for operation, key in zip(operation_list, operation_key):
# get device kernels
err, operation.kernel = cuda.cuModuleGetFunction(
module,
bytes(str.encode(operation.name()))
)
operation_name.append(operation.name())
self.compiled_cache_device.insert(key, operation.kernel)
# get host functions
compiled_host_fns = {}
op_attr = []
# get param size
func_name = operation.name() + '_get_param_size'
func = getattr(host_lib, func_name)
param_size = func()
func_name = operation.name() + '_get_params'
func = getattr(host_lib, func_name)
func.argtype = operation.argtype
func.restype = ctypes.POINTER(ctypes.c_char * param_size)
setattr(operation, 'get_args', func)
compiled_host_fns['get_args'] = func
# set shared memory size
func_name = operation.name() + '_shared_memory_size'
func = getattr(host_lib, func_name)
setattr(operation, 'shared_memory_capacity', func())
compiled_host_fns['shared_memory_capacity'] = func()
# set the maximum dynamic shared size
operation.initialize()
# get extra functions
op_attr.append(param_size)
if hasattr(operation, "extra_funcs"):
for suffix, ret_type in operation.extra_funcs.items():
func_name = operation.name() + '_' + suffix
func = getattr(host_lib, func_name)
if ret_type is not None:
func.restype = ret_type
setattr(operation, suffix, func)
compiled_host_fns[suffix] = func
op_attr.append(suffix)
operation_attr.append(op_attr)
self.compiled_cache_host.insert(key, compiled_host_fns)
            for op_key, op_name, op_attrs in zip(operation_key, operation_name, operation_attr):
                self.insert_operation(
                    op_key, cubin_image, host_file.name, op_name, op_attrs)

View File

@ -1,632 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typeguard import typechecked
from cuda import cuda
from typing import Union
import numpy as np
from pycutlass import *
# @typechecked
class Conv2dArguments(ArgumentBase):
"""
Argument wrapper for Conv2d. It encodes problem information and
user-provided tensors into the kernel's arguments.
:param operation: the Conv2d operation to take the argument
:type operation: :class:`pycutlass.Conv2dOperation`
:param problem_size: the Conv2d problem size
:type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
:param A: tensor A
:type A: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param B: tensor B
:type B: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param C: tensor C
:type C: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param D: tensor D
:type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param split_k_mode: conv2d split K mode, defaults to
cutlass.conv.SplitKMode.Serial
:type split_k_mode: cutlass.conv.SplitKMode, optional
:param output_op: output operator, optional
:type output_op: :class:`pycutlass.LinearCombinationFunctorArguments`
"""
def __init__(self, operation: 'Conv2dOperation',
problem_size: 'cutlass.conv.Conv2dProblemSize',
A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
split_k_mode: 'cutlass.conv.SplitKMode'
= cutlass.conv.SplitKMode.Serial, **kwargs) -> None:
self.operation = operation
#: convolution kind
self.conv_kind: cutlass.conv.Operator = operation.conv_kind
self.layout_A: cutlass.layout = operation.A.layout
self.layout_B: cutlass.layout = operation.B.layout
self.layout_C: cutlass.layout = operation.C.layout
self.element_A = operation.A.element
self.element_B = operation.B.element
self.element_C = operation.C.element
if self.layout_C == cutlass.TensorNC32HW32:
B = self.reorder_tensor_B(B, problem_size)
super().__init__(A, B, C, D, **kwargs)
# preprocessing output ops
if 'output_op' in kwargs.keys() and \
split_k_mode != cutlass.conv.SplitKMode.Parallel:
self.output_op = kwargs['output_op']
else:
self.output_op = self.operation.epilogue_type(1.0, 0.0)
if "split_k_slices" in kwargs.keys():
self.split_k_mode = split_k_mode
self.split_k_slices = kwargs["split_k_slices"]
else:
self.split_k_mode = cutlass.conv.SplitKMode.Serial
self.split_k_slices = 1
#: problem_size
self.problem_size: cutlass.conv.Conv2dProblemSize = problem_size
self.problem_size.split_k_slices = self.split_k_slices
if hasattr(self, "tensor_c_numel"):
c_coord = cutlass.conv.implicit_gemm_tensor_c_extent(
self.conv_kind, problem_size)
if (self.tensor_c_numel == c_coord.at(3) and
self.tensor_c_numel < c_coord.size()):
self.bias = True
#
# initialize the argument
#
self.initialize()
# @typechecked
def reorder_tensor_B(self, tensor_B: 'np.ndarray',
problem_size: 'cutlass.conv.Conv2dProblemSize'):
"""
Reorder tensor_B for interleaved layout
:param tensor_B: input tensor B
:type tensor_B: numpy.ndarray
:param problem_size: Conv2d problem size
:type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
:return: reordered tensor B
:rtype: numpy.ndarray
"""
reordered_tensor_B = np.empty_like(tensor_B)
tensor_ref_B = self.get_tensor_ref(
tensor_B, self.element_B, self.layout_B, problem_size, "b")
reordered_tensor_ref_B = self.get_tensor_ref(
reordered_tensor_B, self.element_B,
self.layout_B, problem_size, "b")
cutlass.conv.host.reorder_convK(
reordered_tensor_ref_B, tensor_ref_B, self.conv_kind, problem_size)
return reordered_tensor_B
def get_tensor_ref(
self, tensor, dtype, tensor_layout, problem_size, operand):
if operand == "a":
tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(
self.conv_kind, problem_size)
elif operand == "b":
tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(
self.conv_kind, problem_size)
elif operand in ["c", "d"]:
tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(
self.conv_kind, problem_size)
else:
raise ValueError("unknown operand: " + operand)
# Zero-stride trick: when C is a broadcast bias vector, give it a packed
# zero-extent layout so every mode has stride 0
if operand == "c" and getattr(self, "bias", False):
tensor_coord = cutlass.Tensor4DCoord(0, 0, 0, 0)
layout = tensor_layout.packed(tensor_coord)
return TensorRef(tensor, dtype, layout).tensor_ref
def get_arguments(self, semaphore):
ref_A = TensorRef_(self.get_tensor_ref(
self.ptr_A, self.element_A, self.layout_A, self.problem_size, "a"))
ref_B = TensorRef_(self.get_tensor_ref(
self.ptr_B, self.element_B, self.layout_B, self.problem_size, "b"))
ref_C = TensorRef_(self.get_tensor_ref(
self.ptr_C, self.element_C, self.layout_C, self.problem_size, "c"))
ref_D = TensorRef_(self.get_tensor_ref(
self.ptr_D, self.element_C, self.layout_C, self.problem_size, "d"))
self.c_arguments = self.operation.argument_type(
Conv2DProblemSize(self.problem_size),
ref_A, ref_B, ref_C, ref_D, self.output_op, self.split_k_mode
)
self.semaphore = semaphore
def initialize(self):
"""
Initialize the kernel arguments handling following stuffs
1. get kernel launch configuration including grid, cta size,
and dynamic shared memory capacity
2. allocate and initialize device workspace
3. get kernel params as bytearray for NVRTC input
"""
# get launch configuration
self.launch_config = self.operation.rt_module.plan(self)
# allocate and initialize device workspace
device_workspace_size = \
self.operation.rt_module.get_device_workspace_size(self)
if device_workspace_size > 0:
self.workspace_buffer = device_mem_alloc(device_workspace_size)
workspace_ptr = self.workspace_buffer.ptr
err, = cuda.cuMemsetD32(
workspace_ptr, 0, device_workspace_size // 4)
else:
workspace_ptr = None
# get kernel params as bytearray
semaphore = 0
if workspace_ptr is not None and \
self.split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.ptr_D = workspace_ptr
elif workspace_ptr is not None and \
self.split_k_mode == cutlass.conv.SplitKMode.Serial:
semaphore = workspace_ptr
self.get_arguments(semaphore)
params_ = self.operation.rt_module.get_args(ctypes.byref(
self.c_arguments), ctypes.c_void_p(int(self.semaphore)))
self.host_workspace = bytearray(params_.contents)
self.device_workspace = None
def sync(self):
"""
Synchronize the arguments. If the input tensors reside in host memory,
copy the results back from device to host.
"""
return super().sync()
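# Illustrative sketch (assumption, not part of the original file): a typical
# way Conv2dArguments is constructed for a serial split-K fprop launch.
# `operation`, `problem_size`, and the tensors are assumed to exist already.
def _example_make_conv2d_arguments(operation, problem_size, A, B, C, D):
    return Conv2dArguments(
        operation=operation, problem_size=problem_size,
        A=A, B=B, C=C, D=D,
        output_op=operation.epilogue_type(1.0, 0.0),
        split_k_mode=cutlass.conv.SplitKMode.Serial,
        split_k_slices=1)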
# @typechecked
class Conv2dRT(ExecutableOperation):
"""
Conv2dRT manages the CUTLASS runtime components
"""
KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
// Dynamic shared memory base pointer
extern __shared__ int SharedStorageBase[];
// Declare pointer to dynamic shared memory.
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
${operation_name}${operation_suffix} op;
op(params, *shared_storage);
}
'''
HostTemplate = r'''
extern "C" {
// Get the size of params in bytes
int ${operation_name}_get_param_size(){
return sizeof(${operation_name}${operation_suffix}::Params);
}
// Get the size of dynamic shared memory in bytes
int ${operation_name}_shared_memory_size() {
return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
}
// Get the params as byte array
char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Arguments* arguments, int *semaphore=nullptr){
typename ${operation_name}${operation_suffix}::Params* params;
params = new ${operation_name}${operation_suffix}::Params(*arguments, semaphore);
char *bytes = ((char*)(params));
char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
output[i] = bytes[i];
delete params;  // the byte copy above is all the caller needs
return output;
}
}
'''
def __init__(self, operation: 'Conv2dOperation'):
super().__init__(operation)
self.argument_type, self.epilogue_type = get_conv2d_arguments(operation.epilogue_functor)
self.argtype = [ctypes.POINTER(self.argument_type), ctypes.c_void_p]
self.conv_kind = operation.conv_kind
self.operation: Conv2dOperation = operation
self.emitter = EmitConv2dInstance('_type')
self.threads: int = operation.tile_description.num_threads
self.swizzle_functor = operation.swizzling_functor
def emit(self):
return self.emitter.emit(self.operation)
# @typechecked
def get_device_workspace_size(self, arguments: Conv2dArguments):
workspace_bytes = 0
launch_config = arguments.launch_config
self.conv_kind = self.operation.conv_kind
if arguments.split_k_mode == cutlass.conv.SplitKMode.Parallel:
problem_size = arguments.problem_size
workspace_bytes = DataTypeSize[self.operation.C.element] \
* launch_config.grid[2] * cutlass.conv.implicit_gemm_tensor_c_size(
self.conv_kind, problem_size
) // 8
elif arguments.split_k_mode == cutlass.conv.SplitKMode.Serial and \
arguments.split_k_slices > 1:
workspace_bytes = launch_config.grid[0] * launch_config.grid[1] * 4
return workspace_bytes
# @typechecked
def plan(self, arguments: Conv2dArguments):
tile_size = cutlass.gemm.GemmCoord(
self.operation.tile_description.threadblock_shape[0],
self.operation.tile_description.threadblock_shape[1],
self.operation.tile_description.threadblock_shape[2]
)
grid = self.swizzle_functor.get_grid_shape(
self.swizzle_functor.get_tiled_shape(
self.conv_kind, arguments.problem_size,
tile_size, arguments.split_k_slices
)
)
return LaunchConfiguration(
[grid.x, grid.y, grid.z], [self.threads, 1, 1],
self.shared_memory_capacity)
def initialize(self):
err, = cuda.cuFuncSetAttribute(
self.kernel,
attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
value=self.shared_memory_capacity)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
#
class Conv2dOperation:
"""
CUTLASS Conv2d operation description.
:param conv_kind: convolution operator
:type conv_kind: :class:`cutlass.conv.Operator`
:param iterator_algorithm: Selects among several implementation
variants trading off performance with simplicity
:type iterator_algorithm: :class:`cutlass.conv.IteratorAlgorithm`
:param arch: GPU compute capability (sm_xx)
:type arch: int
:param tile_description: tile description
:type tile_description: :class:`pycutlass.TileDescription`
:param A: tensor A description
:type A: :class:`pycutlass.TensorDescription`
:param B: tensor B description
:type B: :class:`pycutlass.TensorDescription`
:param C: tensor C description
:type C: :class:`pycutlass.TensorDescription`
:param stride_support: distinguish among partial specializations that
accelerate certain problems where the convolution stride is unity
:type stride_support: :class:`cutlass.conv.StrideSupport`
:param epilogue_functor: convolution epilogue functor
:type epilogue_functor: :class:`EpilogueFunctor`
:param swizzling_functor: threadblock swizzling functor
"""
#
def __init__(self,
conv_kind: cutlass.conv.Operator,
iterator_algorithm: cutlass.conv.IteratorAlgorithm,
arch: int, tile_description: TileDescription,
A: TensorDescription, B: TensorDescription, C: TensorDescription,
stride_support, epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1):
self.operation_kind: OperationKind = OperationKind.Conv2d
self.arch: int = arch
self.tile_description: TileDescription = tile_description
self.conv_kind = conv_kind
self.A: TensorDescription = A
self.B: TensorDescription = B
self.C: TensorDescription = C
self.epilogue_functor = epilogue_functor
self.iterator_algorithm = iterator_algorithm
self.stride_support = stride_support
self.swizzling_functor = swizzling_functor()
self.rt_module: Conv2dRT = Conv2dRT(self)
self.argument_type = self.rt_module.argument_type
self.epilogue_type = self.rt_module.epilogue_type
def run(self, arguments: Conv2dArguments) -> cuda.CUresult:
"""
Launch the cuda kernel with input arguments
:param arguments: conv2d arguments
:type arguments: :class:`pycutlass.Conv2dArguments`
"""
# launch the kernel
err = self.rt_module.run(
arguments.host_workspace,
arguments.device_workspace,
arguments.launch_config)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('CUDA Error %s' % str(err))
return err
#
# Get function name
#
def procedural_name(self):
''' The full procedural name; for Conv2d it is identical to the configuration name. '''
return self.configuration_name()
#
def configuration_name(self):
''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
threadblock = "%dx%d_%dx%d" % (
self.tile_description.threadblock_shape[0],
self.tile_description.threadblock_shape[1],
self.tile_description.threadblock_shape[2],
self.tile_description.stages
)
if self.stride_support == StrideSupport.Unity:
configuration_name = "cutlass_sm${arch}_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_align${alignment}"
else:
configuration_name = "cutlass_sm${arch}_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}"
return SubstituteTemplate(
configuration_name,
{
'arch': str(self.arch),
'opcode_class': opcode_class_name,
'extended_name': self.extended_name(),
'threadblock': threadblock,
'layout': self.layout_name(),
'alignment': "%d" % self.A.alignment,
}
)
#
def extended_name(self):
''' Append data types if they differ from compute type. '''
if self.C.element != self.tile_description.math_instruction.element_accumulator and \
self.A.element != self.tile_description.math_instruction.element_accumulator:
extended_name = "${element_c}_${core_name}_${element_a}"
elif self.C.element == self.tile_description.math_instruction.element_accumulator and \
self.A.element != self.tile_description.math_instruction.element_accumulator:
extended_name = "${core_name}_${element_a}"
else:
extended_name = "${core_name}"
extended_name = SubstituteTemplate(extended_name, {
'element_a': DataTypeNames[self.A.element],
'element_c': DataTypeNames[self.C.element],
'core_name': self.core_name()
})
return extended_name
#
def layout_name(self):
return "%s" % (ShortLayoutTypeNames[self.A.layout])
#
def core_name(self):
''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
intermediate_type = ''
if self.tile_description.math_instruction.opcode_class == cutlass.OpClass.TensorOp:
inst_shape = "%dx%dx%d" % tuple(
self.tile_description.math_instruction.instruction_shape)
if self.tile_description.math_instruction.element_a != self.A.element and \
self.tile_description.math_instruction.element_a != self.accumulator_type():
intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
else:
inst_shape = ''
return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()],
inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm])
#
def is_complex(self):
complex_operators = [
MathOperation.multiply_add_complex,
MathOperation.multiply_add_complex_gaussian
]
return self.tile_description.math_instruction.math_operation in complex_operators
#
def accumulator_type(self):
accum = self.tile_description.math_instruction.element_accumulator
if self.is_complex():
return get_complex_from_real(accum)
return accum
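# Illustrative sketch (assumption, not from the original file): launching a
# Conv2dOperation once its Conv2dArguments have been initialized.
def _example_run_conv2d(operation, arguments):
    operation.run(arguments)  # raises RuntimeError on launch failure
    arguments.sync()          # copy results back to the host tensors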
###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################
class EmitConv2dInstance:
def __init__(self, operation_suffix=''):
self.operation_suffix = operation_suffix
self.includes = [
"cutlass/cutlass.h",
"cutlass/conv/kernel/default_conv2d_fprop.h",
"cutlass/conv/kernel/default_conv2d_dgrad.h",
"cutlass/conv/kernel/default_conv2d_wgrad.h"
]
self.template = """
// Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
using ${operation_name}_base =
typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
${element_a},
${layout_a},
${element_b},
${layout_b},
${element_c},
${layout_c},
${element_accumulator},
${opcode_class},
${arch},
cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
${epilogue_functor},
${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
${stages},
${math_operator},
${iterator_algorithm},
${stride_support},
${align_a},
${align_b}
>::Kernel;
struct ${operation_name}${operation_suffix}:
public ${operation_name}_base { };
"""
def emit(self, operation):
warp_shape = [int(operation.tile_description.threadblock_shape[idx] /
operation.tile_description.warp_count[idx]) for idx in range(3)]
epilogue_vector_length = int(min(
operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
values = {
'operation_name': operation.procedural_name(),
'operation_suffix': self.operation_suffix,
'conv_kind': ConvKindTag[operation.conv_kind],
'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
'element_a': DataTypeTag[operation.A.element],
'layout_a': LayoutTag[operation.A.layout],
'element_b': DataTypeTag[operation.B.element],
'layout_b': LayoutTag[operation.B.layout],
'element_c': DataTypeTag[operation.C.element],
'layout_c': LayoutTag[operation.C.layout],
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
'warp_shape_m': str(warp_shape[0]),
'warp_shape_n': str(warp_shape[1]),
'warp_shape_k': str(warp_shape[2]),
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
'epilogue_vector_length': str(epilogue_vector_length),
'epilogue_functor': operation.epilogue_functor.emit(),
'swizzling_functor': operation.swizzling_functor.tag(),
'stages': str(operation.tile_description.stages),
'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
'stride_support': StrideSupportTag[operation.stride_support],
'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else
MathOperationTag[operation.tile_description.math_instruction.math_operation],
'align_a': str(operation.A.alignment),
'align_b': str(operation.B.alignment),
}
return SubstituteTemplate(self.template, values)

File diff suppressed because it is too large

View File

@ -1,104 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import numpy as np
from cuda import cuda
from pycutlass.memory_manager import *
from typing import TYPE_CHECKING
try:
import torch
torch_available = True
except ImportError:
torch_available = False
if TYPE_CHECKING:
import torch
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
if TYPE_CHECKING:
import cupy as cp
class NumpyFrontend:
"""
Frontend node for numpy
"""
@staticmethod
def argument(np_tensor: 'np.ndarray', is_output: 'bool') -> cuda.CUdeviceptr:
"""Convert the input numpy tensor to CUDA device pointer
:param np_tensor: input numpy nd array
:param is_output: whether the tensor is output
:return: CUDA device pointer
"""
# copy the data to device
if is_output:
return device_mem_alloc(np_tensor.size * np_tensor.itemsize)
else:
return todevice(np_tensor)
class TorchFrontend:
"""
Frontend node for torch
"""
@staticmethod
def argument(torch_tensor: 'torch.Tensor') -> cuda.CUdeviceptr:
"""Convert the input torch tensor to CUDA device pointer
:param torch_tensor: input torch tensor
:param is_output: whether the tensor is output
:return: CUDA device pointer
"""
# check the device of torch_tensor
if not torch_tensor.is_cuda:
torch_tensor = torch_tensor.to("cuda")
return cuda.CUdeviceptr(torch_tensor.data_ptr())
class CupyFrontend:
"""
Frontend node for cupy
"""
@staticmethod
def argument(cupy_ndarray: 'cp.ndarray'):
return cuda.CUdeviceptr(int(cupy_ndarray.data.ptr))

File diff suppressed because it is too large

View File

@ -1,870 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import re
###################################################################################################
import enum
import cutlass
import cute
# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
try:
from enum import auto as enum_auto
except ImportError:
__cutlass_library_auto_enum = 0
def enum_auto() -> int:
global __cutlass_library_auto_enum
i = __cutlass_library_auto_enum
__cutlass_library_auto_enum += 1
return i
###################################################################################################
#
class GeneratorTarget(enum.Enum):
Library = enum_auto()
#
GeneratorTargetNames = {
GeneratorTarget.Library: 'library',
}
#
###################################################################################################
#
ShortDataTypeNames = {
cutlass.int32: 'i',
cutlass.float16: 'h',
cutlass.float32: 's',
cutlass.float64: 'd',
cutlass.dtype.cf32: 'c',
cutlass.dtype.cf64: 'z',
}
#
DataTypeNames = {
cutlass.dtype.b1: "b1",
cutlass.dtype.u4: "u4",
cutlass.dtype.u8: "u8",
cutlass.dtype.u16: "u16",
cutlass.dtype.u32: "u32",
cutlass.dtype.u64: "u64",
cutlass.dtype.s4: "s4",
cutlass.int8: "s8",
cutlass.dtype.s16: "s16",
cutlass.int32: "s32",
cutlass.dtype.s64: "s64",
cutlass.float16: "f16",
cutlass.bfloat16: "bf16",
cutlass.float32: "f32",
cutlass.tfloat32: "tf32",
cutlass.float64: "f64",
cutlass.dtype.cf16: "cf16",
cutlass.dtype.cbf16: "cbf16",
cutlass.dtype.cf32: "cf32",
cutlass.dtype.ctf32: "ctf32",
cutlass.dtype.cf64: "cf64",
cutlass.dtype.cu4: "cu4",
cutlass.dtype.cu8: "cu8",
cutlass.dtype.cu16: "cu16",
cutlass.dtype.cu32: "cu32",
cutlass.dtype.cu64: "cu64",
cutlass.dtype.cs4: "cs4",
cutlass.dtype.cs8: "cs8",
cutlass.dtype.cs16: "cs16",
cutlass.dtype.cs32: "cs32",
cutlass.dtype.cs64: "cs64",
}
DataTypeTag = {
cutlass.dtype.b1: "cutlass::uint1b_t",
cutlass.dtype.u4: "cutlass::uint4b_t",
cutlass.dtype.u8: "uint8_t",
cutlass.dtype.u16: "uint16_t",
cutlass.dtype.u32: "uint32_t",
cutlass.dtype.u64: "uint64_t",
cutlass.dtype.s4: "cutlass::int4b_t",
cutlass.int8: "int8_t",
cutlass.dtype.s16: "int16_t",
cutlass.int32: "int32_t",
cutlass.dtype.s64: "int64_t",
cutlass.float16: "cutlass::half_t",
cutlass.bfloat16: "cutlass::bfloat16_t",
cutlass.float32: "float",
cutlass.tfloat32: "cutlass::tfloat32_t",
cutlass.float64: "double",
cutlass.dtype.cf16: "cutlass::complex<cutlass::half_t>",
cutlass.dtype.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
cutlass.dtype.cf32: "cutlass::complex<float>",
cutlass.dtype.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
cutlass.dtype.cf64: "cutlass::complex<double>",
cutlass.dtype.cu4: "cutlass::complex<cutlass::uint4b_t>",
cutlass.dtype.cu8: "cutlass::complex<cutlass::uint8_t>",
cutlass.dtype.cu16: "cutlass::complex<cutlass::uint16_t>",
cutlass.dtype.cu32: "cutlass::complex<cutlass::uint32_t>",
cutlass.dtype.cu64: "cutlass::complex<cutlass::uint64_t>",
cutlass.dtype.cs4: "cutlass::complex<cutlass::int4b_t>",
cutlass.dtype.cs8: "cutlass::complex<cutlass::int8_t>",
cutlass.dtype.cs16: "cutlass::complex<cutlass::int16_t>",
cutlass.dtype.cs32: "cutlass::complex<cutlass::int32_t>",
cutlass.dtype.cs64: "cutlass::complex<cutlass::int64_t>",
}
DataTypeSize = {
cutlass.dtype.b1: 1,
cutlass.dtype.u4: 4,
cutlass.dtype.u8: 8,
cutlass.dtype.u16: 16,
cutlass.dtype.u32: 32,
cutlass.dtype.u64: 64,
cutlass.dtype.s4: 4,
cutlass.int8: 8,
cutlass.dtype.s16: 16,
cutlass.int32: 32,
cutlass.dtype.s64: 64,
cutlass.float16: 16,
cutlass.bfloat16: 16,
cutlass.float32: 32,
cutlass.tfloat32: 32,
cutlass.float64: 64,
cutlass.dtype.cf16: 32,
cutlass.dtype.cbf16: 32,
cutlass.dtype.cf32: 64,
cutlass.dtype.ctf32: 32,
cutlass.dtype.cf64: 128,
cutlass.dtype.cu4: 8,
cutlass.dtype.cu8: 16,
cutlass.dtype.cu16: 32,
cutlass.dtype.cu32: 64,
cutlass.dtype.cu64: 128,
cutlass.dtype.cs4: 8,
cutlass.dtype.cs8: 16,
cutlass.dtype.cs16: 32,
cutlass.dtype.cs32: 64,
cutlass.dtype.cs64: 128,
}
class DataTypeSizeBytes:
"""
Static class to mimic the `DataTypeSize` dictionary, but with checks for whether the
data type key is less than a full byte or a non-integer number of bytes.
"""
@staticmethod
def __class_getitem__(datatype):
"""
Returns the number of bytes in size the data type is. Raises an exception if the data type
is either less than a full byte or a non-integer number of bytes in size.
:param datatype: data type to query
:return: number of bytes the data type occupies
:rtype: int
"""
bits = DataTypeSize[datatype]
if bits < 8:
raise Exception('Data type {} is less than one byte in size.'.format(datatype))
elif bits % 8 != 0:
raise Exception('Data type {} is not an integer number of bytes.'.format(datatype))
return bits // 8
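# Usage note (sketch): DataTypeSize reports raw bit widths, while
# DataTypeSizeBytes converts to bytes and rejects sub-byte types, e.g.
#   DataTypeSize[cutlass.float16]       -> 16  (bits)
#   DataTypeSizeBytes[cutlass.float16]  -> 2   (bytes)
#   DataTypeSizeBytes[cutlass.dtype.s4] -> raises Exception (4 bits)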
###################################################################################################
#
class BlasMode(enum.Enum):
symmetric = enum_auto()
hermitian = enum_auto()
#
BlasModeTag = {
BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
}
#
ComplexTransformTag = {
cutlass.complex_transform.none: 'cutlass::ComplexTransform::kNone',
cutlass.complex_transform.conj: 'cutlass::ComplexTransform::kConjugate',
}
#
RealComplexBijection = [
(cutlass.float16, cutlass.dtype.cf16),
(cutlass.float32, cutlass.dtype.cf32),
(cutlass.float64, cutlass.dtype.cf64),
]
#
def is_complex(data_type):
for r, c in RealComplexBijection:
if data_type == c:
return True
return False
#
def get_complex_from_real(real_type):
for r, c in RealComplexBijection:
if real_type == r:
return c
return cutlass.dtype.invalid
#
def get_real_from_complex(complex_type):
for r, c in RealComplexBijection:
if complex_type == c:
return r
return cutlass.dtype.invalid
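# Example (sketch): the bijection maps element types in both directions and
# falls back to cutlass.dtype.invalid for unknown types, e.g.
#   get_complex_from_real(cutlass.float32)    -> cutlass.dtype.cf32
#   get_real_from_complex(cutlass.dtype.cf64) -> cutlass.float64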
#
class ComplexMultiplyOp(enum.Enum):
multiply_add = enum_auto()
gaussian = enum_auto()
###################################################################################################
#
class MathOperation(enum.Enum):
multiply_add = enum_auto()
multiply_add_saturate = enum_auto()
xor_popc = enum_auto()
multiply_add_fast_bf16 = enum_auto()
multiply_add_fast_f16 = enum_auto()
multiply_add_fast_f32 = enum_auto()
multiply_add_complex_fast_f32 = enum_auto()
multiply_add_complex = enum_auto()
multiply_add_complex_gaussian = enum_auto()
#
MathOperationNames = {
MathOperation.multiply_add: 'multiply_add',
MathOperation.multiply_add_saturate: 'multiply_add_saturate',
MathOperation.xor_popc: 'xor_popc',
MathOperation.multiply_add_fast_bf16: 'multiply_add_fast_bf16',
MathOperation.multiply_add_fast_f16: 'multiply_add_fast_f16',
MathOperation.multiply_add_fast_f32: 'multiply_add_fast_f32',
MathOperation.multiply_add_complex_fast_f32: 'multiply_add_complex_fast_f32',
MathOperation.multiply_add_complex: 'multiply_add_complex',
MathOperation.multiply_add_complex_gaussian: 'multiply_add_complex_gaussian',
}
#
MathOperationTag = {
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
MathOperation.multiply_add_fast_f32: 'cutlass::arch::OpMultiplyAddFastF32',
MathOperation.multiply_add_complex_fast_f32: 'cutlass::arch::OpMultiplyAddComplexFastF32',
MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
}
###################################################################################################
#
LayoutTag = {
cutlass.ColumnMajor: 'cutlass::layout::ColumnMajor',
cutlass.RowMajor: 'cutlass::layout::RowMajor',
cutlass.layout.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
cutlass.layout.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
cutlass.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
cutlass.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
cutlass.layout.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
cutlass.layout.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
cutlass.TensorNHWC: 'cutlass::layout::TensorNHWC',
cutlass.layout.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
cutlass.layout.TensorNCHW: 'cutlass::layout::TensorNCHW',
cutlass.layout.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
cutlass.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
cutlass.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
cutlass.layout.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
cutlass.layout.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
}
#
TransposedLayout = {
cutlass.ColumnMajor: cutlass.RowMajor,
cutlass.RowMajor: cutlass.ColumnMajor,
cutlass.layout.ColumnMajorInterleaved2: cutlass.layout.RowMajorInterleaved2,
cutlass.layout.RowMajorInterleaved2: cutlass.layout.ColumnMajorInterleaved2,
cutlass.ColumnMajorInterleaved32: cutlass.RowMajorInterleaved32,
cutlass.RowMajorInterleaved32: cutlass.ColumnMajorInterleaved32,
cutlass.layout.ColumnMajorInterleaved64: cutlass.layout.RowMajorInterleaved64,
cutlass.layout.RowMajorInterleaved64: cutlass.layout.ColumnMajorInterleaved64,
cutlass.TensorNHWC: cutlass.TensorNHWC
}
#
ShortLayoutTypeNames = {
cutlass.ColumnMajor: 'n',
cutlass.layout.ColumnMajorInterleaved2: 'n2',
cutlass.ColumnMajorInterleaved32: 'n32',
cutlass.layout.ColumnMajorInterleaved64: 'n64',
cutlass.RowMajor: 't',
cutlass.layout.RowMajorInterleaved2: 't2',
cutlass.RowMajorInterleaved32: 't32',
cutlass.layout.RowMajorInterleaved64: 't64',
cutlass.TensorNHWC: 'nhwc',
cutlass.layout.TensorNDHWC: 'ndhwc',
cutlass.layout.TensorNCHW: 'nchw',
cutlass.layout.TensorNGHWC: 'nghwc',
cutlass.TensorNC32HW32: 'nc32hw32',
cutlass.layout.TensorNC64HW64: 'nc64hw64',
cutlass.TensorC32RSK32: 'c32rsk32',
cutlass.layout.TensorC64RSK64: 'c64rsk64'
}
#
ShortComplexLayoutNames = {
(cutlass.ColumnMajor, cutlass.complex_transform.none): 'n',
(cutlass.ColumnMajor, cutlass.complex_transform.conj): 'c',
(cutlass.RowMajor, cutlass.complex_transform.none): 't',
(cutlass.RowMajor, cutlass.complex_transform.conj): 'h'
}
#
CuTeLayoutTag = {
cute.GMMAMajor.K: 'cute::GMMA::Major::K',
cute.GMMAMajor.MN: 'cute::GMMA::Major::MN'
}
###################################################################################################
#
class SideMode(enum.Enum):
Left = enum_auto()
Right = enum_auto()
#
SideModeTag = {
SideMode.Left: 'cutlass::SideMode::kLeft',
SideMode.Right: 'cutlass::SideMode::kRight'
}
#
ShortSideModeNames = {
SideMode.Left: 'ls',
SideMode.Right: 'rs'
}
###################################################################################################
#
class FillMode(enum.Enum):
Lower = enum_auto()
Upper = enum_auto()
#
FillModeTag = {
FillMode.Lower: 'cutlass::FillMode::kLower',
FillMode.Upper: 'cutlass::FillMode::kUpper'
}
#
ShortFillModeNames = {
FillMode.Lower: 'l',
FillMode.Upper: 'u'
}
###################################################################################################
#
class DiagType(enum.Enum):
NonUnit = enum_auto()
Unit = enum_auto()
#
DiagTypeTag = {
DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
DiagType.Unit: 'cutlass::DiagType::kUnit'
}
#
ShortDiagTypeNames = {
DiagType.NonUnit: 'nu',
DiagType.Unit: 'un'
}
###################################################################################################
OpcodeClassNames = {
cutlass.OpClass.Simt: 'simt',
cutlass.OpClass.TensorOp: 'tensorop',
cutlass.OpClass.WmmaTensorOp: 'wmma_tensorop',
cutlass.OpClass.SparseTensorOp: 'sptensorop'
}
OpcodeClassTag = {
cutlass.OpClass.Simt: 'cutlass::arch::OpClassSimt',
cutlass.OpClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
cutlass.OpClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
cutlass.OpClass.SparseTensorOp: 'cutlass::arch::OpClassSparseTensorOp'
}
###################################################################################################
#
class OperationKind(enum.Enum):
Gemm = enum_auto()
RankK = enum_auto()
Rank2K = enum_auto()
Trmm = enum_auto()
Symm = enum_auto()
Conv2d = enum_auto()
Conv3d = enum_auto()
#
OperationKindNames = {
OperationKind.Gemm: 'gemm',
OperationKind.RankK: 'rank_k',
OperationKind.Rank2K: 'rank_2k',
OperationKind.Trmm: 'trmm',
OperationKind.Symm: 'symm',
OperationKind.Conv2d: 'conv2d',
OperationKind.Conv3d: 'conv3d'
}
#
ArchitectureNames = {
50: 'maxwell',
60: 'pascal',
61: 'pascal',
70: 'volta',
75: 'turing',
80: 'ampere',
90: 'hopper'
}
#
SharedMemPerCC = {
70: 96 << 10, # 96KB of SMEM
72: 96 << 10, # 96KB of SMEM
75: 64 << 10, # 64KB of SMEM
80: 160 << 10, # 164KB of SMEM - 4KB reserved for the driver
86: 100 << 10, # 100KB of SMEM
87: 160 << 10, # 164KB of SMEM - 4KB reserved for the driver
89: 100 << 10, # 100KB of SMEM
90: 227 << 10, # 228KB of SMEM - 1KB reserved for the driver
}
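# Sketch (assumption, not from the original file): this table bounds how many
# pipeline stages fit in shared memory on a given compute capability.
def _max_stages_for_cc(cc, smem_bytes_per_stage):
    # e.g. on SM80 a 16 KiB stage allows (160 << 10) // (16 << 10) = 10 stages
    return SharedMemPerCC[cc] // smem_bytes_per_stage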
###################################################################################################
class GemmKind(enum.Enum):
Gemm = enum_auto()
Sparse = enum_auto()
Universal = enum_auto()
PlanarComplex = enum_auto()
PlanarComplexArray = enum_auto()
Grouped = enum_auto()
#
GemmKindNames = {
GemmKind.Gemm: "gemm",
GemmKind.Sparse: "spgemm",
GemmKind.Universal: "gemm",
GemmKind.PlanarComplex: "gemm_planar_complex",
GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
GemmKind.Grouped: "gemm_grouped"
}
#
class RankKKind(enum.Enum):
Universal = enum_auto()
#
RankKKindNames = {
RankKKind.Universal: "rank_k"
}
#
class TrmmKind(enum.Enum):
Universal = enum_auto()
#
TrmmKindNames = {
TrmmKind.Universal: "trmm"
}
#
class SymmKind(enum.Enum):
Universal = enum_auto()
#
SymmKindNames = {
SymmKind.Universal: "symm"
}
#
class SwizzlingFunctor(enum.Enum):
Identity1 = enum_auto()
Identity2 = enum_auto()
Identity4 = enum_auto()
Identity8 = enum_auto()
Horizontal = enum_auto()
BatchedIdentity1 = enum_auto()
StridedDgradIdentity1 = enum_auto()
StridedDgradIdentity4 = enum_auto()
StridedDgradHorizontal = enum_auto()
#
SwizzlingFunctorTag = {
cutlass.IdentitySwizzle1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
SwizzlingFunctor.Horizontal: 'cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle',
SwizzlingFunctor.BatchedIdentity1: "cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle",
SwizzlingFunctor.StridedDgradIdentity1: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>',
SwizzlingFunctor.StridedDgradIdentity4: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>',
SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
}
#
class SchedulerMode(enum.Enum):
Device = enum_auto()
Host = enum_auto()
#
SchedulerModeTag = {
SchedulerMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
SchedulerMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
}
#
ShortSchedulerModeNames = {
SchedulerMode.Device: 'Device',
SchedulerMode.Host: 'Host'
}
###################################################################################################
#
ConvKindTag = {
cutlass.conv.Operator.fprop: 'cutlass::conv::Operator::kFprop',
cutlass.conv.Operator.dgrad: 'cutlass::conv::Operator::kDgrad',
cutlass.conv.Operator.wgrad: 'cutlass::conv::Operator::kWgrad'
}
ConvKindNames = {
cutlass.conv.Operator.fprop: 'fprop',
cutlass.conv.Operator.dgrad: 'dgrad',
cutlass.conv.Operator.wgrad: 'wgrad',
}
#
IteratorAlgorithmTag = {
cutlass.conv.IteratorAlgorithm.analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
cutlass.conv.IteratorAlgorithm.optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
cutlass.conv.IteratorAlgorithm.fixed_channels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
cutlass.conv.IteratorAlgorithm.few_channels: 'cutlass::conv::IteratorAlgorithm::kFewChannels'
}
IteratorAlgorithmNames = {
cutlass.conv.IteratorAlgorithm.analytic: 'analytic',
cutlass.conv.IteratorAlgorithm.optimized: 'optimized',
cutlass.conv.IteratorAlgorithm.fixed_channels: 'fixed_channels',
cutlass.conv.IteratorAlgorithm.few_channels: 'few_channels'
}
#
class StrideSupport(enum.Enum):
Strided = enum_auto()
Unity = enum_auto()
#
StrideSupportTag = {
StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
}
StrideSupportNames = {
StrideSupport.Strided: '',
StrideSupport.Unity: 'unity_stride',
}
class ConvMode(enum.Enum):
CrossCorrelation = enum_auto()
Convolution = enum_auto()
#
ConvModeTag = {
ConvMode.CrossCorrelation: 'cutlass::conv::Mode::kCrossCorrelation',
ConvMode.Convolution: 'cutlass::conv::Mode::kConvolution'
}
###################################################################################################
#
class MathInstruction:
"""
Description of the lowest-level matrix-multiply-accumulate operation to be used in a kernel
"""
def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class=cutlass.OpClass.Simt, math_operation=MathOperation.multiply_add):
"""
:param instruction_shape: size of the [M, N, K] dimensions of the instruction
:type instruction_shape: list or tuple
:param element_a: data type of operand A
:param element_b: data type of operand B
:param element_accumulator: data type used in accumulation
:param opcode_class: higher-level class of the instruction (e.g., SIMT or Tensor Core)
:type opcode_class: cutlass.OpClass
:param math_operation: the type of low-level operation to be performed (e.g., multiply accumulate)
:type math_operation: MathOperation
"""
self.instruction_shape = instruction_shape
self.element_a = element_a
self.element_b = element_b
self.element_accumulator = element_accumulator
self.opcode_class = opcode_class
self.math_operation = math_operation
#
class TileDescription:
"""
Description of a tile of computation to be performed in the kernel, encompassing threadblock, cluster, and warp shapes,
stage count, and math instruction specification
"""
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, cluster_shape=[1, 1, 1], persistent=False):
"""
:param threadblock_shape: shape of a threadblock tile
:type threadblock_shape: list or tuple
:param stages: number of pipeline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
number of stages that can be supported for an operation on a given architecture will be computed at a later time
:type stages: int or None
:param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile
:type warp_count: list, tuple, or None
:param math_instruction: specification of the instruction type and shape to be performed and the types of its operands
:type math_instruction: MathInstruction
:param cluster_shape: number of threadblocks in the [X, Y, Z] dimensions of a threadblock cluster
:param persistent: whether the kernel uses persistent warp-specialized threadblocks (only available for SM90+)
:type persistent: bool
"""
self.threadblock_shape = threadblock_shape
self.cluster_shape = cluster_shape
self.persistent: bool = persistent
self.stages: int = stages
self.math_instruction = math_instruction
# Number of warps along x, y, z directions
self.warp_count = warp_count
@property
def num_threads(self):
"""
Returns the number of threads in the threadblock
:return: number of threads in the threadblock
:rtype: int or None (if warp count is None)
"""
if self.warp_count is not None:
threads = 32
for cnt in self.warp_count:
threads *= cnt
return threads
return None
def procedural_name(self):
"""
Returns a name identifying the tile description
:return: name identifying the tile description
:rtype: int
"""
emit_stages = 0 if self.stages is None else self.stages
name = "%dx%dx%d_%dx%d_%dx%d" % (
self.cluster_shape[0], self.cluster_shape[1], self.cluster_shape[2],
self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], emit_stages)
if self.persistent:
name += '_persistent'
return name
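# Example (sketch): a 128x128x64 threadblock tile, a 2x1x1 cluster, and 4
# stages produce the name "2x1x1_128x128_64x4".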
#
class TensorDescription:
def __init__(self, element, layout, alignment=1, complex_transform=cutlass.complex_transform.none):
self.element = element
self.layout = layout
self.alignment = min(128 // DataTypeSize[self.element], alignment)
self.complex_transform = complex_transform
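# Example (sketch): alignment is clamped so one access never exceeds 128 bits;
# TensorDescription(cutlass.float16, cutlass.RowMajor, alignment=16) stores
# alignment = min(128 // 16, 16) = 8 elements.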
#
class SymmetricTensorDescription:
def __init__(self, element, layout, fill_mode, alignment=1, complex_transform=cutlass.complex_transform.none, side_mode=SideMode.Left):
self.element = element
self.layout = layout
self.fill_mode = fill_mode
self.alignment = alignment
self.complex_transform = complex_transform
self.side_mode = side_mode
#
class TriangularTensorDescription:
def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment=1, complex_transform=cutlass.complex_transform.none):
self.element = element
self.layout = layout
self.side_mode = side_mode
self.fill_mode = fill_mode
self.diag_type = diag_type
self.alignment = alignment
self.complex_transform = complex_transform
###################################################################################################
#
def CalculateSmemUsagePerStage(operation):
"""
Returns the amount of shared memory in bytes consumed in a single stage of a kernel.
:param op: operation for which the maximum stages should be computed. If stages are
set via the `op.tile_description.stages` parameter, this setting is ignored
in the present calculation
:type op: pycutlass.Operation
:return: number of bytes of shared memory consumed by a single stage
:rtype: int
"""
m, n, k = operation.tile_description.threadblock_shape
if operation.operation_kind == OperationKind.Gemm:
stage_barrier_bytes = 32
return (DataTypeSize[operation.A.element] * m * k // 8) + \
(DataTypeSize[operation.B.element] * k * n // 8) + stage_barrier_bytes
else:
raise Exception('Unsupported operation kind {}.'.format(operation.operation_kind))
#
def CalculateSmemUsage(operation):
"""
Returns the amount of shared memory in bytes consumed by a kernel.
:param op: operation for which the maximum stages should be computed. If stages are
set via the `op.tile_description.stages` parameter, this setting is ignored
in the present calculation
:type op: pycutlass.Operation
:return: int
"""
return operation.tile_description.stages * CalculateSmemUsagePerStage(operation)
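# Worked example (sketch): a 128x128x32 threadblock tile with f16 A and B
# consumes (16*128*32 // 8) + (16*32*128 // 8) + 32 = 16416 bytes per stage,
# so with 3 stages CalculateSmemUsage returns 3 * 16416 = 49248 bytes.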
class ApiVersion(enum.Enum):
"""
Differentiate between CUTLASS 2.x and 3.x API versions
"""
v2x = enum_auto()
v3x = enum_auto()
def api_version(arch, opclass, datatype):
"""
Returns whether the architecture, opcode class, and datatype in question require using CUTLASS 2.x
or 3.x for code emission.
:param arch: compute capability of device on which to run
:type arch: int
:param opclass: class of the operation being performed
:type opclass: cutlass.OpClass
:param datatype: data type to be used in operation (assumes that ElementA and ElementB are the same)
:return: API version to be used in code emission
:rtype: ApiVersion
"""
if arch >= 90 and opclass == cutlass.OpClass.TensorOp and (datatype != cutlass.float64):
return ApiVersion.v3x
else:
return ApiVersion.v2x
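# Example (sketch) of the dispatch rule above:
#   api_version(90, cutlass.OpClass.TensorOp, cutlass.float16) -> ApiVersion.v3x
#   api_version(90, cutlass.OpClass.TensorOp, cutlass.float64) -> ApiVersion.v2x
#   api_version(80, cutlass.OpClass.TensorOp, cutlass.float16) -> ApiVersion.v2x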
###################################################################################################

View File

@ -1,74 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import rmm
import numpy as np
class PoolMemoryManager:
def __init__(self, init_pool_size: int, max_pool_size: int) -> None:
self.pool = rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(),
initial_pool_size=init_pool_size,
maximum_pool_size=max_pool_size
)
self.mr = rmm.mr.TrackingResourceAdaptor(self.pool)
rmm.mr.set_current_device_resource(self.mr)
def get_allocated_size(self):
return self.mr.get_allocated_bytes()
def pool_size(self):
return self.pool.pool_size()
def todevice(host_data, dtype=np.float32):
"""
Copy the host data to device memory
"""
if isinstance(host_data, list):
return rmm.DeviceBuffer.to_device(np.array(host_data, dtype=dtype).tobytes())
elif isinstance(host_data, np.ndarray):
return rmm.DeviceBuffer.to_device(host_data.tobytes())
else:
raise TypeError('unsupported host data type: {}'.format(type(host_data)))
def device_mem_alloc(size):
return rmm.DeviceBuffer(size=size)
def align_size(size, alignment=256):
return ((size + alignment - 1) // alignment) * alignment
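# Example (sketch): align_size rounds a byte count up to the next multiple of
# `alignment`, e.g. align_size(1000) -> 1024 and align_size(512) -> 512.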
def get_allocated_size():
device_resource = rmm.mr.get_current_device_resource()
return device_resource.get_allocated_bytes()

View File

@ -1,153 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import ctypes
from cuda import cuda
from pycutlass.utils.device import device_cc
from cuda import __version__ as __cuda_version__
_version_splits = [int(x) for x in __cuda_version__.split('.')]
supports_cluster_launch = device_cc() >= 90 and (_version_splits[0] > 11 or (_version_splits[0] == 11 and _version_splits[1] >= 8))
################################################################################
#
# Launch configuration
#
################################################################################
class LaunchConfiguration:
def __init__(self, grid=[1, 1, 1], block=[1, 1, 1], smem=0):
self.grid = grid
self.block = block
self.shared_memory_capacity = smem
################################################################################
#
# Base class for an executable operation
#
################################################################################
class ExecutableOperation:
'''
Base class wrapping a compiled CUTLASS kernel and its launch logic
'''
def __init__(self, operation):
self.operation = operation
self.module = None
self.kernel = None
#
def name(self):
return self.operation.procedural_name()
#
def emit(self):
return ''
#
def can_implement(self, configuration, arguments):
raise NotImplementedError()
#
def get_host_workspace_size(self, arguments):
raise NotImplementedError()
#
def get_device_workspace_size(self, arguments):
raise NotImplementedError()
#
def plan(self, arguments):
raise NotImplementedError()
#
def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=cuda.CUstream(0)):
raise NotImplementedError()
#
def run_with_clusters(self, launch_config, kernel_params, stream=cuda.CUstream(0)):
if hasattr(self.operation, 'tile_description') and hasattr(self.operation.tile_description, 'cluster_shape'):
attr = cuda.CUlaunchAttribute()
attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.operation.tile_description.cluster_shape
attr.id = cuda.CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
attrs = [attr]
# Allow for non-portable cluster sizes
err, = cuda.cuFuncSetAttribute(
self.kernel, cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)
if err != cuda.CUresult.CUDA_SUCCESS:
return err
else:
attrs = []
config = cuda.CUlaunchConfig()
config.gridDimX, config.gridDimY, config.gridDimZ = launch_config.grid
config.blockDimX, config.blockDimY, config.blockDimZ = launch_config.block
config.sharedMemBytes = launch_config.shared_memory_capacity
config.hStream = stream
config.attrs = attrs
config.numAttrs = len(attrs)
err, = cuda.cuLaunchKernelEx(config, f=self.kernel, kernelParams=kernel_params, extra=0)
return err
#
def run_without_clusters(self, launch_config, kernel_params, stream=cuda.CUstream(0)):
err, = cuda.cuLaunchKernel(
self.kernel,
launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
launch_config.block[0], launch_config.block[1], launch_config.block[2],
launch_config.shared_memory_capacity,
stream,
kernel_params,
0)
return err
#
def run(self, host_workspace, device_workspace, launch_config, stream=cuda.CUstream(0)):
cArg = (ctypes.c_char * len(host_workspace)
).from_buffer(host_workspace)
packed = (ctypes.c_void_p * 1)()
packed[0] = ctypes.addressof(cArg)
if supports_cluster_launch:
return self.run_with_clusters(launch_config, packed, stream)
else:
return self.run_without_clusters(launch_config, packed, stream)
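# Note (sketch): host_workspace is the raw Params bytearray emitted by the
# host-side get_params() helper; it is wrapped in a single void* kernel
# parameter so the device kernel receives its Params struct by value.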

View File

@ -1,614 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typing import Generic, TypeVar
from treelib import Tree
import numpy as np
from pycutlass import *
import pycutlass
import ast
import textwrap
import inspect
################################################################################
# Type annotation for input arguments
################################################################################
Ttype = TypeVar("Ttype")
Dtype = TypeVar("Dtype")
class NDArray(np.ndarray, Generic[Ttype, Dtype]):
pass
################################################################################
# Operations
################################################################################
operators = {
ast.Add: "Add",
ast.Div: "Div",
ast.Eq: "Equal",
ast.Mult: "Mult"
}
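# Illustration (not part of the original module): parsing a typical
# epilogue expression shows how BinOp operator types index this table.
# "alpha * accum + beta * c" parses to an Add at the root with a Mult
# on each side.
_example = ast.parse("alpha * accum + beta * c", mode="eval").body
assert operators[type(_example.op)] == "Add"        # root BinOp
assert operators[type(_example.left.op)] == "Mult"  # left subtree
del _example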
################################################################################
# AST Node abstractions
################################################################################
class UnaryNode:
cnt = 0
# Concept: this is created by the BinOp Node in python ast
def __init__(self,
element_accumulator, element_compute, elements_per_access,
node, args) -> None:
if isinstance(node, BinOpNode):
self.op = node.op
elif isinstance(node, ast.Call):
if isinstance(node.func, ast.Name):
self.op = node.func.id
elif isinstance(node.func, ast.Attribute):
self.op = node.func.value.id
else:
raise TypeError
else:
raise TypeError
self.tag = "Unary" + self.op + str(UnaryNode.cnt)
self.id = self.op + str(UnaryNode.cnt)
self.args = args
UnaryNode.cnt += 1
self.type = "tensor"
self.epilogue_op = getattr(pycutlass, self.op)(element_compute)
# data types
self.element_accumulator = element_accumulator
self.element_compute = element_compute
self.elements_per_access = elements_per_access
def get_epilogue_node(self, visitors):
self.epilogue_node = UnaryOp(
self.element_accumulator, self.element_compute,
self.elements_per_access, *visitors, self.epilogue_op)
def get_argument(self, visitor_args, kwargs):
epilogue_ops = []
for arg in self.args:
            try:
                epilogue_ops.append(kwargs[arg])
            except KeyError:
                epilogue_ops.append(arg)  # direct arguments such as constants
self.argument = self.epilogue_node.argument_type(self.epilogue_op.argument_type(*epilogue_ops), *visitor_args)
class BinOpNode:
cnt = 0
# Concept: this is created by the BinOp Node in python ast
def __init__(self,
element_accumulator, element_compute, elements_per_access,
node) -> None:
self.op = operators[type(node.op)]
self.tag = "Binary" + self.op + str(BinOpNode.cnt)
self.id = self.op + str(BinOpNode.cnt)
self.args = None
BinOpNode.cnt += 1
self.type = "tensor"
self.epilogue_op = getattr(pycutlass, "Vector"+self.op)(element_compute)
# data types
self.element_accumulator = element_accumulator
self.element_compute = element_compute
self.elements_per_access = elements_per_access
def get_epilogue_node(self, visitors):
self.epilogue_node = BinaryOp(
self.element_accumulator, self.element_compute,
self.elements_per_access, *visitors, self.epilogue_op)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(self.epilogue_op.argument_type(self.args), *visitor_args)
class NameNode:
# Concept: this is created by the Name Node in python ast
def __init__(self, node) -> None:
        try:
            self.id = node.id
        except AttributeError:
            self.id = node.targets[0].id
self.tag = self.id
class ScalarInputNode(NameNode):
# Concept: scalar
def __init__(self, node) -> None:
super().__init__(node)
self.tag = "Scalar:" + self.tag
self.type = "scalar"
class AccumulatorNode(NameNode):
# Concept: VisitorOpAccumulator
def __init__(self,
element_accumulator, elements_per_access, node) -> None:
super().__init__(node)
self.tag = "Accum:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
self.elements_per_access = elements_per_access
def get_epilogue_node(self, visitors):
self.epilogue_node = AccumulatorOp(
self.element_accumulator, self.elements_per_access)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type()
class TensorInputNode(NameNode):
# Concept: VisitorOpTensorInput
def __init__(self, element_accumulator, node) -> None:
super().__init__(node)
self.tag = "TensorInput:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
def get_epilogue_node(self, *args):
self.epilogue_node = TensorInputOp(self.element_accumulator)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(
kwargs[self.id + "_ptr"], kwargs["problem_size"][1],
kwargs["problem_size"][0] * kwargs["problem_size"][1])
class RowBroadcastNode(NameNode):
# Concept: VisitorOpRowBroadcast
def __init__(self, element_accumulator, element_fragment, node) -> None:
super().__init__(node)
#
self.tag = "RowBroadcast:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_fragment = element_fragment
def get_epilogue_node(self, *args):
self.epilogue_node = RowBroadcastOp(
self.element_accumulator, self.element_fragment)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][1])
class ColumnBroadcastNode(NameNode):
# Concept: VisitorOpColumnBroadcast
def __init__(self, element_accumulator, element_fragment, node) -> None:
super().__init__(node)
self.tag = "ColumnBroadcast:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_fragment = element_fragment
def get_epilogue_node(self, *args):
self.epilogue_node = ColumnBroadcastOp(
self.element_accumulator, self.element_fragment)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][0])
class TensorOutputNode(NameNode):
# Concept: VisitorOpTensorOutput
def __init__(self, element_accumulator, node) -> None:
super().__init__(node)
self.tag = "TensorOutput:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
def get_epilogue_node(self, visitors):
self.epilogue_node = TensorOutputOp(self.element_accumulator, *visitors)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][1], *visitor_args, kwargs["problem_size"][0] * kwargs["problem_size"][1])
class RowReductionNode:
# Concept: RowReductionOp
def __init__(self, element_accumulator, element_reduction,
element_reduction_accumulator, id, factor) -> None:
#
self.id = id
self.tag = "RowReduction:" + self.id
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_reduction = element_reduction
self.element_reduction_accumulator = element_reduction_accumulator
self.factor = factor
def get_epilogue_node(self, visitors):
self.epilogue_node = RowReductionOp(
self.element_accumulator, self.element_reduction,
self.element_reduction_accumulator, *visitors)
def get_batch_stride(self, problem_size):
return problem_size[0] * ((problem_size[1] + self.factor - 1) // self.factor)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], *visitor_args, self.get_batch_stride(kwargs["problem_size"]))
class ColumnReductionNode:
# Concept: ColumnReductionOp
def __init__(self, element_accumulator, element_reduction,
element_reduction_accumulator, id, factor) -> None:
#
self.id = id
self.tag = "ColumnReduction:" + self.id
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_reduction = element_reduction
self.element_reduction_accumulator = element_reduction_accumulator
self.factor = factor
def get_epilogue_node(self, visitors):
self.epilogue_node = ColumnReductionOp(
self.element_accumulator, self.element_reduction,
self.element_reduction_accumulator, *visitors)
def get_batch_stride(self, problem_size):
return problem_size[1] * ((problem_size[0] + self.factor - 1) // self.factor)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + '_ptr'], *visitor_args, self.get_batch_stride(kwargs["problem_size"]))
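# Worked example (illustrative, not part of the original module): the
# batch stride between split-K partitions uses ceiling division on the
# reduced dimension. For a RowReductionNode over a 128x100 problem with
# factor 64 (the threadblock column count):
#   128 * ((100 + 64 - 1) // 64) == 128 * 2 == 256 elements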
################################################################################
# Epilogue parser function
################################################################################
class EpilogueAST(ast.NodeVisitor):
def __init__(self, epilogue,
tile_description,
element_accumulator, elements_per_access,
element_compute, element_output) -> None:
#
self.tile_description = tile_description
self.element_accumulator = element_accumulator
self.elements_per_access = elements_per_access
self.element_compute = element_compute
self.element_output = element_output
self.epilogue = epilogue
self.source = textwrap.dedent(inspect.getsource(epilogue.__call__))
self.ast_tree = ast.parse(self.source)
self.epilogue_tree = Tree()
# print(ast.dump(self.ast_tree, indent=4)) # For Debug purpose
# input arguments
self.input_args = {}
# return nodes
self.returns = []
# reduction source nodes
self.reduction_source = {}
# stack used to keep the parent node id
self.stack = []
# visit the AST
self.visit(self.ast_tree)
# visit the name node
def visit_Name(self, node):
# append the return ids into self.returns
if self.stack[-1] == "return":
self.returns.append(node.id)
else:
# accum is produced from accumulator node
if node.id == "accum":
name_node = AccumulatorNode(
self.element_accumulator, self.elements_per_access, node)
else:
# for input nodes
                if node.id in self.input_args.keys():
                    arg_type = self.input_args[node.id][0]
                    if arg_type == "tensor":
                        name_node = TensorInputNode(self.element_accumulator, node)
                    elif arg_type == "row":
                        name_node = RowBroadcastNode(self.element_accumulator, self.element_compute, node)
                    elif arg_type == "column":
                        name_node = ColumnBroadcastNode(self.element_accumulator, self.element_compute, node)
                    elif arg_type == "scalar":
                        name_node = ScalarInputNode(node)
                    else:
                        raise ValueError(arg_type)
# for output nodes
else:
name_node = TensorOutputNode(self.element_accumulator, node)
self.epilogue_tree.create_node(name_node.tag, name_node.id, data=name_node, parent=self.stack[-1])
def visit_Assign(self, node):
pre_assign_node = self.epilogue_tree.get_node(node.targets[0].id)
if pre_assign_node is None:
# The assign is to a root node
# skip the reduction nodes
if isinstance(node.value, ast.Call):
if isinstance(node.value.func, ast.Name):
func_type = node.value.func.id
elif isinstance(node.value.func, ast.Attribute):
func_type = node.value.func.value.id
else:
raise TypeError
if func_type == 'reduction_op':
self.reduction_source[node.value.args[0].id] = [node.value.args[1].value, node.value.args[2].value, node.targets[0].id]
return
name_node = TensorOutputNode(self.element_accumulator, node)
self.epilogue_tree.create_node(name_node.tag, name_node.id, data=name_node)
self.stack.append(name_node.id)
else:
if node.targets[0].id in self.returns or node.targets[0].id in self.reduction_source.keys():
self.stack.append(node.targets[0].id)
else:
self.stack.append(pre_assign_node.predecessor(self.epilogue_tree.identifier))
self.epilogue_tree.remove_node(node.targets[0].id)
# get child tag
self.visit(node.value)
self.stack.pop()
def visit_Call(self, node):
if isinstance(node.func, ast.Name):
func_type = node.func.id
elif isinstance(node.func, ast.Attribute):
func_type = node.func.value.id
else:
raise TypeError
if func_type == "reduction_op":
self.visit(node.args[0])
else:
arg_list = []
for idx, arg in enumerate(node.args):
if idx == 0: continue
if isinstance(arg, ast.Constant):
arg_list.append(arg.value)
elif isinstance(arg, ast.Name):
arg_list.append(arg.id)
else:
raise TypeError
unary_node = UnaryNode(self.element_accumulator, self.element_compute, self.elements_per_access, node, arg_list)
self.epilogue_tree.create_node(unary_node.tag, unary_node.id, parent=self.stack[-1], data=unary_node)
self.stack.append(unary_node.id)
self.visit(node.args[0])
self.stack.pop()
def visit_BinOp(self, node):
binop = BinOpNode(self.element_accumulator, self.element_compute,
self.elements_per_access, node)
self.epilogue_tree.create_node(binop.tag, binop.id, data=binop, parent=self.stack[-1])
self.stack.append(binop.id)
self.visit(node.left)
self.visit(node.right)
self.stack.pop()
def visit_Return(self, node):
self.stack.append("return")
self.visit(node.value)
self.stack.pop()
    # A function definition
def visit_FunctionDef(self, node: ast.FunctionDef):
# visit args
for arg in node.args.args:
if arg.arg == "self": continue
if isinstance(arg.annotation, ast.Constant):
self.input_args[arg.arg] = [arg.annotation.value, ]
# visit the assign in the reverse order
for idx in range(len(node.body)):
self.visit(node.body[-1-idx])
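    # Example (hypothetical) epilogue in the form this visitor parses:
    # input kinds are declared via string annotations ("tensor", "row",
    # "column", "scalar"), "accum" names the accumulator, and an
    # assignment to a fresh name that is returned becomes a tensor
    # output:
    #
    #     def __call__(self, accum, c: 'tensor', alpha: 'scalar', beta: 'scalar'):
    #         d = alpha * accum + beta * c
    #         return d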
#
# Tree optimization pass
#
# pass 1: lower Binary to Unary
def pass_binary_2_unary(self, tree, nid):
node = tree.get_node(nid)
if isinstance(node.data, BinOpNode):
lhs_node = tree.get_node(node.successors(tree.identifier)[0])
left_type = lhs_node.data.type
rhs_node = tree.get_node(node.successors(tree.identifier)[1])
right_type = rhs_node.data.type
if left_type == "scalar" and right_type == "tensor":
node.data = UnaryNode(
self.element_accumulator, self.element_compute,
self.elements_per_access,
node.data, [lhs_node.data.id,])
node.tag = node.data.tag
tree.remove_node(lhs_node.data.id)
self.pass_binary_2_unary(tree, rhs_node.data.id)
elif left_type == "tensor" and right_type == "scalar":
node.data = UnaryNode(
self.element_accumulator, self.element_compute,
self.elements_per_access,
node.data, [rhs_node.id,])
node.tag = node.data.tag
tree.remove_node(rhs_node.data.id)
self.pass_binary_2_unary(tree, lhs_node.data.id)
else:
self.pass_binary_2_unary(tree, lhs_node.data.id)
self.pass_binary_2_unary(tree, rhs_node.data.id)
else:
for child in node.successors(tree.identifier):
self.pass_binary_2_unary(tree, child)
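    # For example (a sketch): in "alpha * accum", the Mult BinOpNode has
    # a scalar child ("alpha") and a tensor child ("accum"); the pass
    # rewrites the BinOpNode into a UnaryNode that applies Mult with
    # "alpha" folded into its argument list, removes the scalar leaf,
    # and recurses into the remaining tensor subtree.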
# pass 2: inject reduction nodes
def pass_inject_reduction(self, tree, nid):
node = tree.get_node(nid)
if isinstance(node.data, TensorOutputNode):
if node.data.id in self.reduction_source.keys():
direction = self.reduction_source[node.data.id][0]
target = self.reduction_source[node.data.id][-1]
if direction == 'row':
reduction_node = RowReductionNode(
self.element_accumulator, self.element_output,
self.element_accumulator, target, self.tile_description.threadblock_shape[1])
elif direction == "column":
reduction_node = ColumnReductionNode(
self.element_accumulator, self.element_output,
self.element_accumulator, target, self.tile_description.threadblock_shape[0])
else:
raise ValueError(direction)
child_nid = node.successors(tree.identifier)[0]
# if this output node is injected only for reduction
if node.data.id not in self.returns:
# get reduction config from disc
node.data = reduction_node
node.tag = reduction_node.tag
self.pass_inject_reduction(tree, child_nid)
# if this output node is also a tensor output, inject reduction as its children
else:
# get child node
tree.create_node(reduction_node.tag, reduction_node.id, data=reduction_node, parent=node.data.id)
tree.move_node(child_nid, reduction_node.id)
child = tree.get_node(child_nid)
for grand_child in child.successors(tree.identifier):
self.pass_inject_reduction(tree, grand_child)
else:
for child in node.successors(tree.identifier):
self.pass_inject_reduction(tree, child)
else:
for child in node.successors(tree.identifier):
self.pass_inject_reduction(tree, child)
def pass_inject_epilogue_op(self, tree, nid):
node = tree.get_node(nid)
visitors = []
for child in node.successors(tree.identifier):
visitors.append(self.pass_inject_epilogue_op(tree, child))
node.data.get_epilogue_node(visitors)
return node.data.epilogue_node
def get_arguments(self, tree, nid, kwargs):
node = tree.get_node(nid)
visitor_args = []
for child in node.successors(tree.identifier):
visitor_args.append(self.get_arguments(tree, child, kwargs))
node.data.get_argument(visitor_args, kwargs)
return node.data.argument
class EpilogueVisitTree:
KernelTemplate = """
${visitor}
using ${operation_name}_EpilogueVisitor = cutlass::epilogue::threadblock::EpilogueVisitorGeneric<${visitor_name}>;
"""
def __init__(self, elementwise_functor, tile_description,
element_accumulator, elements_per_access,
element_compute, element_output) -> None:
#
# data types
self.tile_description = tile_description
self.element_accumulator = element_accumulator
self.elements_per_access = elements_per_access
self.element_compute = element_compute
self.element_output = element_output
        self.elementwise_functor = elementwise_functor
def initialize(self):
function = EpilogueAST(self, self.tile_description,
self.element_accumulator, self.elements_per_access,
self.element_compute, self.element_output)
#
tree = function.epilogue_tree
self.tree = tree
function.pass_binary_2_unary(self.tree, self.tree.root)
function.pass_inject_reduction(self.tree, self.tree.root)
        function.pass_inject_epilogue_op(self.tree, self.tree.root)
visitor = self.tree.get_node(self.tree.root).data.epilogue_node
self.visitor = visitor
class _Argument(ctypes.Structure):
_fields_ = [
("visitor_arg", visitor.argument_type)
]
def __init__(self, **kwargs) -> None:
# process input args
_kwargs = {}
for input_key in function.input_args.keys():
if input_key == "accum":
continue
if function.input_args[input_key][0] == "scalar":
continue
# tensor input
else:
setattr(self, "buffer_tensor_" + input_key, NumpyFrontend.argument(kwargs[input_key], False))
setattr(self, input_key + "_ptr", int(getattr(self, "buffer_tensor_" + input_key).ptr))
_kwargs[input_key+"_ptr"] = getattr(self, input_key + "_ptr")
# process the return args
for ret in function.returns:
setattr(self, "buffer_tensor_" + ret, NumpyFrontend.argument(kwargs[ret], True))
setattr(self, ret + "_ptr", int(getattr(self, "buffer_tensor_" + ret).ptr))
_kwargs[ret+"_ptr"] = getattr(self, ret + "_ptr")
setattr(self, "host_tensor_" + ret, kwargs[ret])
_kwargs.update(kwargs)
function.get_arguments(tree, tree.root, _kwargs)
self.visitor_arg = tree.get_node(tree.root).data.argument
def sync(self, stream_sync=True):
if stream_sync:
                err, = cudart.cudaDeviceSynchronize()
                if err != cudart.cudaError_t.cudaSuccess:
                    raise RuntimeError("CUDA Error %s" % str(err))
for ret in function.returns:
err, = cuda.cuMemcpyDtoH(
getattr(self, "host_tensor_" + ret), cuda.CUdeviceptr(getattr(self, ret + "_ptr")),
getattr(self, "host_tensor_" + ret).size * getattr(self, "host_tensor_" + ret).itemsize
)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
self.epilogue_type = _Argument
def emit(self, operation):
values = {
'visitor': self.visitor.emit(operation),
'operation_name': operation.procedural_name(),
'visitor_name': self.visitor.instance_name
}
return SubstituteTemplate(self.KernelTemplate, values)
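A minimal usage sketch (hypothetical subclass and type choices; `elementwise_functor` and `tile_description` are assumed to exist, and `EpilogueAST` parses the source of `__call__`, so the expression must use the annotated input kinds described above):

    class LinearScaling(EpilogueVisitTree):
        def __call__(self, accum, c: 'tensor', alpha: 'scalar', beta: 'scalar'):
            d = alpha * accum + beta * c
            return d

    epilogue = LinearScaling(
        elementwise_functor, tile_description,
        element_accumulator=cutlass.float32, elements_per_access=8,
        element_compute=cutlass.float32, element_output=cutlass.float32)
    epilogue.initialize()  # builds the visitor tree and defines epilogue.epilogue_type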

View File

@ -1,398 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from pycutlass import *
from pycutlass.c_types import get_reduction_params
import cutlass
from cuda import cuda
try:
import torch
torch_available = True
except ImportError:
torch_available = False
import numpy as np
from typing import Union
from cuda import cudart
class ReductionOperation:
pass
class ReductionArguments:
"""
Arguments of reduction
"""
def __init__(self, operation: ReductionOperation,
problem_size: 'list[int]', partitions: int,
workspace: cuda.CUdeviceptr,
destination: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
source: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]', **kwargs) -> None:
# tensor_C can be interpreted as the bias with bias=True in keyword args
if "bias" in kwargs.keys():
self.bias = kwargs["bias"]
else:
# by default, tensor_C is not bias
self.bias = False
self.operation = operation
#: pointer to the workspace
self.ptr_workspace = workspace
#: number of split-k partitions
self.partitions = partitions
if isinstance(destination, np.ndarray):
self.host_D = destination
self.destination_buffer = NumpyFrontend.argument(destination, True)
self.source_buffer = NumpyFrontend.argument(source, False)
self.ptr_destination = cuda.CUdeviceptr(
self.destination_buffer.ptr)
self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
elif torch_available and isinstance(destination, torch.Tensor):
self.ptr_destination = TorchFrontend.argument(destination)
self.ptr_source = TorchFrontend.argument(source)
elif isinstance(destination, cuda.CUdeviceptr):
self.ptr_destination = destination
self.ptr_source = source
else:
raise TypeError("unknown Type")
self.problem_size = MatrixCoord_(
problem_size[0], problem_size[1]
)
self.partition_stride = problem_size[0] * \
problem_size[1] * DataTypeSize[operation.C.element] // 8
if "output_op" in kwargs.keys():
self.output_op = kwargs['output_op']
else:
self.output_op = self.operation.epilogue_type(1.0, 0.0)
# get arguments
self.get_arguments()
@staticmethod
def get_tensor_ref(extent: 'tuple[int]', device_ptr: cuda.CUdeviceptr, layout: cutlass.layout):
if layout == cutlass.RowMajor:
return TensorRef2D_(int(device_ptr), extent[1])
else:
raise ValueError("unknown layout type")
def get_arguments(self):
ref_workspace = ReductionArguments.get_tensor_ref(
extent=[self.problem_size.row, self.problem_size.column],
device_ptr=self.ptr_workspace, layout=cutlass.RowMajor)
if self.bias:
ref_source = ReductionArguments.get_tensor_ref(
extent=[0, 0],
device_ptr=self.ptr_source, layout=cutlass.RowMajor)
else:
ref_source = ReductionArguments.get_tensor_ref(
extent=[self.problem_size.row, self.problem_size.column],
device_ptr=self.ptr_source, layout=cutlass.RowMajor)
ref_destination = ReductionArguments.get_tensor_ref(
extent=[self.problem_size.row, self.problem_size.column],
device_ptr=self.ptr_destination, layout=cutlass.RowMajor)
self.c_arguments = self.operation.argument_type(
self.problem_size, self.partitions,
self.partition_stride, ref_workspace,
ref_destination, ref_source,
self.output_op
)
params_ = self.operation.rt_module.get_args(
ctypes.byref(self.c_arguments))
self.host_workspace = bytearray(params_.contents)
def sync(self):
        err, = cudart.cudaDeviceSynchronize()
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("CUDA Error %s" % str(err))
if hasattr(self, "host_D"):
err, = cuda.cuMemcpyDtoH(
self.host_D, self.ptr_destination, self.host_D.size * self.host_D.itemsize)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
def free(self):
if hasattr(self, "destination_buffer"):
del self.destination_buffer
if hasattr(self, "source_buffer"):
del self.source_buffer
class ReductionRT(ExecutableOperation):
"""
ReductionRT manages the CUTLASS runtime components for reduction
"""
KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
// Dynamic shared memory base pointer
extern __shared__ int SharedStorageBase[];
// Declare pointer to dynamic shared memory.
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
${operation_name}${operation_suffix} op;
op(params, *shared_storage);
}
'''
HostTemplate = r'''
extern "C" {
// Get the size of params in bytes
int ${operation_name}_get_param_size(){
return sizeof(${operation_name}${operation_suffix}::Params);
}
// Get the size of dynamic shared memory in bytes
int ${operation_name}_shared_memory_size() {
return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
}
// Get the params as byte array
char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Params* params){
char *bytes = ((char*)(params));
char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
output[i] = bytes[i];
return output;
}
}
'''
def __init__(self, operation: ReductionOperation):
super().__init__(operation)
self.operation: ReductionOperation = operation
self.emitter = EmitReductionInstance('_type')
self.elements_per_access = self.operation.count
self.argument_type, self.epilogue_type = get_reduction_params(operation.epilogue_functor)
self.argtype = [ctypes.POINTER(self.argument_type)]
def emit(self):
return self.emitter.emit(self.operation)
def plan(self, arguments: ReductionArguments):
        block_shape = [self.operation.shape.column() // self.elements_per_access,
                       self.operation.shape.row(), 1]
grid_shape = [
(arguments.problem_size.row + self.operation.shape.row() -
1) // self.operation.shape.row(),
(arguments.problem_size.column + self.operation.shape.column() -
1) // self.operation.shape.column(),
1
]
return LaunchConfiguration(grid_shape, block_shape, self.shared_memory_capacity)
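    # Worked example (illustrative): Conv2dLauncher builds this operation
    # with shape MatrixCoord(4, 32 * alignment) and count = alignment.
    # With alignment 4 the shape is 4x128 and elements_per_access is 4,
    # so block = [128 // 4, 4, 1] = [32, 4, 1] (128 threads), and the
    # grid tiles the problem at 4 rows x 128 columns per CTA.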
def initialize(self):
err, = cuda.cuFuncSetAttribute(
self.kernel,
attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
value=self.shared_memory_capacity)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
class ReductionOperation:
"""
CUTLASS Reduction Operation
    shape: shape of the CTA tile
    epilogue_functor: output operator applied after the reduction
    """
def __init__(self, shape: cutlass.MatrixCoord, C: TensorDescription,
element_accumulator, element_workspace=None,
element_compute=None, epilogue_functor=None,
count: int = 1, partitions_per_stage: int = 4) -> None:
""" Constructor
"""
self.shape = shape
#: epilogue functor (default: LinearCombination)
self.epilogue_functor = epilogue_functor
#: datatype of accumulator
self.element_accumulator = element_accumulator
if element_workspace is None:
#: datatype of workspace
self.element_workspace = element_accumulator
else:
#: datatype of workspace
self.element_workspace = element_workspace
        if element_compute is None:
            #: datatype of compute
            self.element_compute = element_accumulator
        else:
            #: datatype of compute
            self.element_compute = element_compute
#: datatype of output
self.element_output = C.element
#: operand C
self.C: TensorDescription = C
#: reduce op processing size
self.count: int = count
#: number of partitions to reduce per stage
self.partitions_per_stage: int = partitions_per_stage
self.rt_module: ReductionRT = ReductionRT(self)
self.argument_type = self.rt_module.argument_type
self.epilogue_type = self.rt_module.epilogue_type
#
def extended_name(self):
extend_name = "${element_workspace}_${element_accumulator}_${element_compute}_${element_output}"
return SubstituteTemplate(extend_name,
{
'element_workspace': DataTypeNames[self.element_workspace],
'element_accumulator': DataTypeNames[self.element_accumulator],
'element_compute': DataTypeNames[self.element_compute],
'element_output': DataTypeNames[self.element_output]
})
#
def configuration_name(self):
        ''' The configuration name encodes the extended name and tile size '''
configuration_name = "cutlass_reduce_split_k_${extended_name}_${threadblock}"
threadblock = "%dx%d" % (
self.shape.row(),
self.shape.column()
)
return SubstituteTemplate(
configuration_name,
{
'extended_name': self.extended_name(),
'threadblock': threadblock
}
)
#
def procedural_name(self):
        ''' The procedural name is identical to the configuration name '''
return self.configuration_name()
def run(self, arguments: ReductionArguments) -> cuda.CUresult:
"""
Configure and launch the cuda kernel with input arguments
"""
# get launch configuration
launch_config = self.rt_module.plan(arguments)
# get the host and device workspace
host_workspace = arguments.host_workspace
device_workspace = None
# launch the kernel
err = self.rt_module.run(
host_workspace, device_workspace, launch_config)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('CUDA Error %s' % str(err))
return err
class EmitReductionInstance:
def __init__(self, operation_suffix='') -> None:
self.operation_suffix = operation_suffix
self.includes = [
"cutlass/cutlass.h",
"cutlass/numeric_types.h",
"cutlass/arch/arch.h",
"cutlass/arch/mma.h",
"cutlass/layout/matrix.h",
"cutlass/gemm/device/gemm.h",
"cutlass/gemm/device/gemm_universal_adapter.h",
"cutlass/gemm/kernel/default_gemm_universal.h",
"cutlass/reduction/kernel/reduce_split_k.h",
"cutlass/reduction/thread/reduction_operators.h"
]
self.template = """
// Reduction kernel instance
using ${operation_name}_base =
typename cutlass::reduction::kernel::ReduceSplitK<
cutlass::MatrixShape<${shape_row}, ${shape_column}>,
${epilogue_functor},
cutlass::reduction::thread::ReduceAdd<
${element_accumulator},
${element_output},
${count}>,
${partition_per_stage}>;
struct ${operation_name}${operation_suffix}:
public ${operation_name}_base { };
"""
def emit(self, operation: ReductionOperation):
epilogue_vector_length = int(min(
operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
values = {
'operation_name': operation.configuration_name(),
'operation_suffix': self.operation_suffix,
'shape_row': str(operation.shape.row()),
'shape_column': str(operation.shape.column()),
'epilogue_functor': operation.epilogue_functor.emit(),
'element_output': DataTypeTag[operation.element_output],
'epilogue_vector_length': str(epilogue_vector_length),
'element_accumulator': DataTypeTag[operation.element_accumulator],
'element_compute': DataTypeTag[operation.element_compute],
'element_workspace': DataTypeTag[operation.element_workspace],
'count': str(operation.count),
'partition_per_stage': str(operation.partitions_per_stage)
}
return SubstituteTemplate(self.template, values)

View File

@ -1,70 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typeguard import typechecked
import numpy as np
try:
import torch
torch_available = True
except ImportError:
torch_available = False
from cuda import cuda
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
import cutlass
# @typechecked
class TensorRef:
"""
Python Wrapper for cutlass.TensorRef
"""
def __init__(self, tensor, dtype, layout) -> None:
if isinstance(tensor, np.ndarray):
ptr = cuda.CUdeviceptr(tensor.__array_interface__['data'][0])
elif torch_available and isinstance(tensor, torch.Tensor):
ptr = cuda.CUdeviceptr(tensor.data_ptr())
elif cupy_available and isinstance(tensor, cp.ndarray):
ptr = cuda.CUdeviceptr(int(tensor.data.ptr))
elif isinstance(tensor, cuda.CUdeviceptr):
ptr = tensor
elif isinstance(tensor, int):
ptr = cuda.CUdeviceptr(tensor)
else:
raise NotImplementedError(tensor)
# the dtype(0) is used to overload between different data types
# with the same layout
self.tensor_ref = cutlass.get_tensor_ref(int(ptr), dtype(0), layout)
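A minimal usage sketch (illustrative; it assumes the bindings expose `cutlass.float32` as a callable numeric type, which the `dtype(0)` overload above requires, and accept `cutlass.RowMajor` as the layout argument):

    import numpy as np

    a = np.zeros((128, 64), dtype=np.float32)
    ref = TensorRef(a, cutlass.float32, cutlass.RowMajor)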

View File

@ -1,4 +0,0 @@
from pycutlass.test.profiler import *
from pycutlass.test.conv2d_testbed import *
from pycutlass.test.gemm_testbed import *
from pycutlass.test.gemm_grouped_testbed import *

View File

@ -1,632 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
from pycutlass.test import *
from time import sleep
from bfloat16 import bfloat16
import subprocess
from typeguard import typechecked
import re
def getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand):
ptr = tensor.__array_interface__['data'][0]
if operand == "a":
tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
elif operand == "b":
tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
elif operand in ["c", "d"]:
tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
else:
raise ValueError("unknown operand: " + operand)
layout = tensor_layout.packed(tensor_coord)
if tensor.dtype == np.float64:
return cutlass.TensorRefF64NHWC(ptr, layout)
elif tensor.dtype == np.float32:
return cutlass.TensorRefF32NHWC(ptr, layout)
elif tensor.dtype == np.float16:
return cutlass.TensorRefF16NHWC(ptr, layout)
    elif tensor.dtype == bfloat16:
        return cutlass.TensorRefBF16NHWC(ptr, layout)
elif tensor.dtype == np.int32:
return cutlass.TensorRefS32NHWC(ptr, layout)
elif tensor.dtype == np.int8:
if tensor_layout == cutlass.TensorNC32HW32:
return cutlass.TensorRefS8NC32HW32(ptr, layout)
elif tensor_layout == cutlass.TensorC32RSK32:
return cutlass.TensorRefS8C32RSK32(ptr, layout)
else:
return cutlass.TensorRefS8NHWC(ptr, layout)
else:
raise ValueError("unsupported data type")
def getTensorView(tensor, tensor_layout, conv_kind, problem_size, operand):
tensor_ref = getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand)
if operand == "a":
tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
elif operand == "b":
tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
elif operand in ["c", "d"]:
tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
else:
raise ValueError("unknown operand: " + operand)
if tensor.dtype == np.float64:
return cutlass.TensorViewF64NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.float32:
return cutlass.TensorViewF32NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.float16:
return cutlass.TensorViewF16NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == bfloat16:
return cutlass.TensorViewBF16NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.int32:
return cutlass.TensorViewS32NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.int8:
if tensor_layout == cutlass.TensorNC32HW32:
return cutlass.TensorViewS8NC32HW32(tensor_ref, tensor_coord)
elif tensor_layout == cutlass.TensorC32RSK32:
return cutlass.TensorViewS8C32RSK32(tensor_ref, tensor_coord)
else:
return cutlass.TensorViewS8NHWC(tensor_ref, tensor_coord)
else:
raise ValueError("unsupported data type")
# @typechecked
class Conv2dLauncher:
"""
Launcher that runs the operation on given problem size
"""
def __init__(self, operation: 'Conv2dOperation', seed: int=2080, interleaved=False,
verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
self.enable_cached_results = True
self.interleaved = interleaved
# create the reduction kernel
self.reduction_operation = ReductionOperation(
shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
element_compute=operation.epilogue_functor.element_epilogue, epilogue_functor=operation.epilogue_functor,
count=operation.C.alignment
)
#: verify the output result
self.verification = verification
#: profile the kernel's runtime
self.profiling = profiling
self.timer = GpuTimer()
self.warmup_iterations = warmup_iterations
self.iterations = iterations
if "sleep" in kwargs.keys():
self.sleep_time = kwargs["sleep"]
else:
self.sleep_time = 0
#
# Compile the operator
#
pycutlass.compiler.add_module([operation, self.reduction_operation])
self.operation = operation
self.dtype_A = Conv2dLauncher.numpy_type(operation.A.element)
self.layout_A = operation.A.layout
self.dtype_B = Conv2dLauncher.numpy_type(operation.B.element)
self.layout_B = operation.B.layout
self.dtype_C = Conv2dLauncher.numpy_type(operation.C.element)
self.layout_C = operation.C.layout
self.dtype_D = Conv2dLauncher.numpy_type(operation.C.element)
self.layout_D = operation.C.layout
accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
element_size = DataTypeSize[operation.A.element]
if element_size <= 8:
self.scope = 1
elif element_size == 16:
if accumulator_size <= 16:
self.scope = 2
else:
self.scope = 4
else:
self.scope = 7
# Seed
self.seed = seed
self.conv_kind = operation.conv_kind
#
# Get the host reference function
#
self.element_compute = operation.epilogue_functor.element_epilogue
self.host_conv2d = cutlass.test.conv.host.conv2d
self.timer = GpuTimer()
@staticmethod
def numpy_type(type):
if type == cutlass.float64:
return np.float64
elif type == cutlass.float32:
return np.float32
elif type == cutlass.float16:
return np.float16
elif type == cutlass.bfloat16:
return bfloat16
elif type == cutlass.int32:
return np.int32
elif type == cutlass.int8:
return np.int8
else:
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def print_problem_size(self, p, split_k_mode=1):
print("nhwc_%dx%dx%dx%d_krsc_%dx%dx%dx%d_padding_%dx%d_stride_%dx%d_dilation_%dx%d_splitkslices_%d_splitkmode_%d"
% (p.N, p.H, p.W, p.C, p.K, p.R, p.S, p.C, p.pad_h,
p.pad_w, p.stride_h, p.stride_w, p.dilation_h, p.dilation_w, p.split_k_slices, split_k_mode))
def uniform_init(self, size, dtype):
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
return np.ceil(
np.random.uniform(
low=-self.scope - 0.5, high=self.scope - 0.5,
size=size).astype(dtype)
)
else:
return np.random.uniform(
low=-self.scope - 1, high=self.scope + 1,
size=size).astype(dtype)
def eq_gemm_size(self, problem_size):
n = problem_size.N
p = problem_size.P
q = problem_size.Q
k = problem_size.K
r = problem_size.R
s = problem_size.S
c = problem_size.C
h = problem_size.H
w = problem_size.W
if self.conv_kind == cutlass.conv.Operator.fprop:
return cutlass.gemm.GemmCoord(n * p * q, k, r * s * c)
elif self.conv_kind == cutlass.conv.Operator.dgrad:
return cutlass.gemm.GemmCoord(n * h * w, c, k * r * s)
else:
return cutlass.gemm.GemmCoord(k, r * s * c, n * p * q)
def bytes(self, problem_size, alpha, beta):
mnk = self.eq_gemm_size(problem_size)
bytes_ = \
(DataTypeSize[self.operation.A.element] * mnk.m() // 8) * mnk.k() + \
(DataTypeSize[self.operation.B.element] * mnk.n() // 8) * mnk.k() + \
(DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()
if beta != 0:
bytes_ += (DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()
return bytes_
def flops(self, problem_size):
mnk = self.eq_gemm_size(problem_size)
flops_mainloop_ = mnk.m() * mnk.n() * mnk.k() * 2
flops_epilogue_ = mnk.m() * mnk.n() * 2
# Adjust mainloop flop for dgrad stride
if self.conv_kind == cutlass.conv.Operator.dgrad:
flops_mainloop_ = flops_mainloop_ // (problem_size.stride_h * problem_size.stride_w)
flops_total_ = flops_mainloop_ + flops_epilogue_
return flops_total_
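    # Worked example (illustrative): an fprop problem with N=1, H=W=56,
    # C=64, K=64, R=S=1, unit stride and no padding maps to a GEMM with
    # M = 1*56*56 = 3136, N = 64, K = 1*1*64 = 64, giving
    # 2*3136*64*64 ~= 25.7 MFLOP in the mainloop plus
    # 2*3136*64 ~= 0.4 MFLOP in the epilogue.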
def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
if self.element_compute == cutlass.float16:
alpha = cutlass.float16(alpha)
beta = cutlass.float16(beta)
elif self.element_compute == cutlass.int32:
alpha = int(alpha)
beta = int(beta)
# if cached result is loaded
cached_result_loaded = False
if self.enable_cached_results:
# get problem key
cached_test_key = cutlass.test.conv.host.CreateCachedConv2dTestKey(
self.conv_kind, problem_size, alpha, beta,
getTensorView(tensor_A, self.layout_A, self.conv_kind, problem_size, "a"),
getTensorView(tensor_B, self.layout_B, self.conv_kind, problem_size, "b"),
getTensorView(tensor_C, self.layout_C, self.conv_kind, problem_size, "c"),
)
cached_test_result = cutlass.test.conv.host.CachedTestResult()
conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (self.operation.arch, self.seed)
cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
cached = cached_results.find(cached_test_key)
cached_result_loaded = cached[0]
            if cached_result_loaded:
                cached_test_result = cached[1]
if not cached_result_loaded:
# compute the conv2d on host
tensor_D_ref = np.ones_like(tensor_C)
tensor_ref_A = getTensorRef(tensor_A, self.layout_A, self.conv_kind, problem_size, "a")
tensor_ref_B = getTensorRef(tensor_B, self.layout_B, self.conv_kind, problem_size, "b")
tensor_ref_C = getTensorRef(tensor_C, self.layout_C, self.conv_kind, problem_size, "c")
tensor_ref_D_ref = getTensorRef(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
self.host_conv2d(
self.conv_kind, problem_size,
tensor_ref_A, tensor_ref_B, tensor_ref_C, tensor_ref_D_ref,
alpha, beta
)
tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
if self.enable_cached_results:
cached_test_result.D = cutlass.test.conv.host.TensorHash(tensor_view_D_ref)
cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
cached_results.append(cached_test_key, cached_test_result)
cached_results.write(conv2d_result_cache_name)
else:
return tensor_D_ref
return cached_test_result.D
def equal(self, tensor_D, tensor_D_ref, problem_size):
if self.enable_cached_results:
tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
tensor_D_hash = cutlass.test.conv.host.TensorHash(tensor_view_D)
return tensor_D_hash == tensor_D_ref
else:
tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
return cutlass.test.conv.host.equals(tensor_view_D, tensor_view_D_ref)
def run_cutlass_profiler(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial, alpha=1.0, beta=0.0):
if split_k_mode == cutlass.conv.SplitKMode.Serial:
split_k_mode_ = "serial"
else:
split_k_mode_ = "parallel"
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
values = {
"profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
"kernel_name": self.operation.procedural_name(),
"verification_providers": "device",
"provider": "cutlass",
'n': str(problem_size.N),
'h': str(problem_size.H),
'w': str(problem_size.W),
'c': str(problem_size.C),
'k': str(problem_size.K),
'r': str(problem_size.R),
's': str(problem_size.S),
'p': str(problem_size.P),
'q': str(problem_size.Q),
'pad_h': str(problem_size.pad_h),
'pad_w': str(problem_size.pad_w),
'stride_h': str(problem_size.stride_h),
'stride_w': str(problem_size.stride_w),
'dilation_h': str(problem_size.dilation_h),
'dilation_w': str(problem_size.dilation_w),
'split_k_slices': str(problem_size.split_k_slices),
'split_k_mode': split_k_mode_,
'alpha': str(alpha),
'beta': str(beta),
'warmup': str(self.warmup_iterations),
'profile': str(self.iterations)
}
cmd_template = \
"${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
" --providers=${provider} --n=${n} --h=${h} --w=${w} --c=${c} --k=${k} --r=${r} --s=${s} --p=${p}" \
" --q=${q} --pad_h=${pad_h} --pad_w=${pad_w} --stride_h={stride_h} --stride_w=${stride_w}" \
" --dilation_h=${dilation_h} --dilation_w=${dilation_w} --warmup-iterations=${warmup} --profiling-iterations=${profile}" \
" --split_k_slices=${split_k_slices} --alpha=${alpha} --beta=${beta} --split_k_mode=${split_k_mode}"
cmd = SubstituteTemplate(cmd_template, values)
result = subprocess.getoutput(cmd)
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
runtime = float(m.group('runtime'))
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
bytes = int(m.group('bytes'))
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
flops = int(m.group('flops'))
# check if the problem size matches
assert bytes == self.bytes(problem_size, alpha, beta)
assert flops == self.flops(problem_size)
return runtime
def run(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial,
alpha=1.0, beta=0.0):
        assert get_allocated_size() == 0, "%d bytes of pool memory were not released in the previous run" % get_allocated_size()
#
# Initialize input and output tensors
#
tensor_A_size = cutlass.conv.implicit_gemm_tensor_a_size(self.conv_kind, problem_size)
tensor_B_size = cutlass.conv.implicit_gemm_tensor_b_size(self.conv_kind, problem_size)
tensor_C_size = cutlass.conv.implicit_gemm_tensor_c_size(self.conv_kind, problem_size)
np.random.seed(self.seed)
tensor_A = self.uniform_init(size=(tensor_A_size,), dtype=self.dtype_A)
tensor_B = self.uniform_init(size=(tensor_B_size,), dtype=self.dtype_B)
tensor_C = self.uniform_init(size=(tensor_C_size,), dtype=self.dtype_C)
tensor_D = np.zeros(shape=(tensor_C_size,), dtype=self.dtype_D)
#
# Launch kernel
#
arguments = Conv2dArguments(
operation=self.operation, problem_size=problem_size, A=tensor_A,
B=tensor_B, C=tensor_C, D=tensor_D,
output_op = self.operation.epilogue_type(alpha, beta),
split_k_slices=problem_size.split_k_slices,
split_k_mode=split_k_mode
)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
implicit_gemm_size = cutlass.conv.implicit_gemm_problem_size(self.operation.conv_kind, arguments.problem_size)
reduction_arguments = ReductionArguments(
self.reduction_operation,
problem_size=[implicit_gemm_size.m(), implicit_gemm_size.n()], partitions=problem_size.split_k_slices,
workspace=arguments.ptr_D,
destination=tensor_D,
source=tensor_C,
output_op = self.reduction_operation.epilogue_type(alpha, beta)
)
self.operation.run(arguments)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.reduction_operation.run(reduction_arguments)
passed = True
if self.verification:
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
reduction_arguments.sync()
else:
arguments.sync()
tensor_D_ref = self.host_reference(problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)
passed = self.equal(tensor_D, tensor_D_ref, problem_size)
try:
assert passed
except AssertionError:
self.print_problem_size(problem_size, split_k_mode)
if self.profiling:
sleep(self.sleep_time)
for _ in range(self.warmup_iterations):
self.operation.run(arguments)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.reduction_operation.run(reduction_arguments)
self.timer.start()
            for _ in range(self.iterations):
self.operation.run(arguments)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.reduction_operation.run(reduction_arguments)
self.timer.stop_and_wait()
runtime = self.timer.duration(self.iterations)
# free memory
del arguments
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
del reduction_arguments
        assert get_allocated_size() == 0, "%d bytes of pool memory were not released after the current run" % get_allocated_size()
if self.profiling:
return runtime
return passed
########################################################################################################
# TestAllConv: runs the cutlass::conv::device::ImplicitGemmConvolution operator and compares it with a reference
# TestAllConv runs the conv operator on the default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
# Additionally, each conv2d test can provide its own problem sizes (conv_test_sizes) and a blacklist of
# sizes to skip (conv_blacklist_sizes)
############################################################################################################
def test_all_conv2d(operation: Conv2dOperation, conv_test_sizes = [], interleaved=False):
passed = True
#
# Testbed object
#
testbed = Conv2dLauncher(operation, interleaved=interleaved)
#
# Get conv problem sizes to run conv operator
#
conv_problems = cutlass.test.conv.TestbedConv2dProblemSizes(64)
# Vector of conv2d problem sizes to avoid duplicate runs
conv_tested_sizes = []
    # Flatten the 2D problem-size vectors into a 1D list of problem sizes
problem_sizes = conv_problems.conv2d_default_sizes
    problem_sizes = list(problem_sizes) + conv_test_sizes
# Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slices=1, alpha=1.0, beta=0.0)
for conv_problem in problem_sizes:
if conv_problem in conv_tested_sizes:
continue
# skip channel dimension % 32 != 0 for interleaved case
if interleaved:
if conv_problem.K % 32 != 0 or conv_problem.C % 32 != 0:
continue
#
# Procedurally disable certain cases
#
        # CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1}
if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Unity:
if not ((conv_problem.stride_h == 1) and (conv_problem.stride_w == 1)):
continue
if not interleaved:
# Fixed channels algorithm requires channel count to match access size
if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.fixed_channels:
if conv_problem.C != operation.A.alignment:
continue
# Few channels algorithm requires channel count to match access size
if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.few_channels:
if conv_problem.C % operation.A.alignment:
continue
# CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
# Although strided dgrad works for all stride combinations, we are only going
# to run strided dgrad for non-unity strides
if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
if (conv_problem.stride_h == 1) and (conv_problem.stride_w == 1):
continue
#
# Test
#
# push back tested problem size to avoid re-running duplicates
conv_tested_sizes.append(conv_problem)
passed = testbed.run(conv_problem)
if not passed:
return False
if interleaved:
return True
#
# filter the cases for split K
#
# Small-channels convolution can't run here.
if operation.iterator_algorithm in [cutlass.conv.IteratorAlgorithm.fixed_channels, cutlass.conv.IteratorAlgorithm.few_channels]:
return True
    # CUTLASS DGRAD's *strided* specialization does not support split-k mode
if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
conv_problem = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 56, 56, 8),
cutlass.Tensor4DCoord(8, 1, 1, 8),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
)
passed = testbed.run(conv_problem)
return passed
    # Sweep split-k-slices using serial and parallel reduction with non-unity alpha and non-zero beta for
    # a single conv2d problem size. Convolution unit tests take a long time to run, so only sweep parameters
    # that are absolutely necessary to catch functional bugs. The code below does provide the option to sweep
    # alpha and beta for local testing, but only runs one value for each.
conv2d_split_k_test_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 17, 11, 288),
cutlass.Tensor4DCoord(160, 3, 3, 288),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
)
split_k_modes = [cutlass.conv.SplitKMode.Parallel, cutlass.conv.SplitKMode.Serial]
split_k_slices = [1, 2, 3, 4, 201]
problem_alpha = [2.0,]
problem_beta = [2.0,]
for split_k_mode in split_k_modes:
for split_k_slice in split_k_slices:
for alpha in problem_alpha:
for beta in problem_beta:
passed = testbed.run(conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
split_k_mode,
alpha, beta)
return passed
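A typical invocation (a sketch; `operation` stands for a compiled Conv2dOperation built elsewhere):

    passed = test_all_conv2d(operation)
    assert passed
    # extra user-provided sizes can be appended via conv_test_sizes:
    # passed = test_all_conv2d(operation, conv_test_sizes=[my_problem_size])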

View File

@ -1,235 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass.test.gemm_testbed import getTensorRef, getTensorView, transpose
from pycutlass import *
import numpy as np
import cutlass
from bfloat16 import bfloat16
class TestbedGrouped:
def __init__(self, operation: GemmOperationGrouped, seed: int = 2080) -> None:
pycutlass.compiler.add_module([operation])
self.seed = seed
self.operation = operation
element_size = DataTypeSize[operation.A.element]
self.dtype_A = self.numpy_type(operation.A.element)
self.dtype_B = self.numpy_type(operation.B.element)
self.dtype_C = self.numpy_type(operation.C.element)
self.dtype_D = self.numpy_type(operation.C.element)
if element_size == 1:
self.scope_max = 1
self.scope_min = 0
elif element_size <= 8:
self.scope_max = 1
self.scope_min = -1
elif element_size == 16:
self.scope_max = 4
self.scope_min = -4
else:
self.scope_max = 8
self.scope_min = -8
#: compute type
self.compute_type = operation.epilogue_functor.element_epilogue
self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
@staticmethod
def numpy_type(type):
if type == cutlass.float64:
return np.float64
elif type == cutlass.float32:
return np.float32
elif type == cutlass.float16:
return np.float16
elif type == cutlass.bfloat16:
return bfloat16
elif type == cutlass.int32:
return np.int32
elif type == cutlass.int8:
return np.int8
else:
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def uniform_init(self, size, dtype):
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
return np.ceil(
np.random.uniform(
low=self.scope_min - 0.5, high=self.scope_max - 0.5,
size=size).astype(dtype)
)
else:
return np.random.uniform(
low=self.scope_min - 1, high=self.scope_max + 1,
size=size).astype(dtype)
def print_problem_size(self, p):
problem_size = "problem: %d, %d, %d\n" % (p.m(), p.n(), p.k())
print(problem_size)
def run(self, problem_count: int, alpha: float = 1.0, beta: float = 0.0) -> bool:
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released in the previous run" % get_allocated_size()
# initialize
np.random.seed(self.seed)
# generate the problem sizes
problem_sizes = []
tensor_As = []
tensor_Bs = []
tensor_Cs = []
tensor_Ds = []
tensor_D_refs = []
for i in range(problem_count):
if self.dtype_A == np.int8:
if i == 0:
problem_size = cutlass.gemm.GemmCoord(48, 16, 32)
else:
problem_size = cutlass.gemm.GemmCoord(
16 * np.random.randint(0, 64) + 48,
16 * np.random.randint(0, 64) + 48,
16 * np.random.randint(0, 64) + 48
)
else:
if i == 0:
problem_size = cutlass.gemm.GemmCoord(48, 16, 8)
else:
problem_size = cutlass.gemm.GemmCoord(
8 * np.random.randint(0, 64) + 24,
8 * np.random.randint(0, 64) + 24,
8 * np.random.randint(0, 64) + 24
)
tensor_As.append(
self.uniform_init(
size=(problem_size.m() * problem_size.k(),),
dtype=self.dtype_A)
)
tensor_Bs.append(
self.uniform_init(
size=(problem_size.n() * problem_size.k(),),
dtype=self.dtype_B)
)
tensor_Cs.append(
self.uniform_init(
size=(problem_size.m() * problem_size.n(),),
dtype=self.dtype_C)
)
tensor_Ds.append(
np.zeros(
shape=(problem_size.m() * problem_size.n(),),
dtype=self.dtype_D
)
)
tensor_D_refs.append(
np.ones(
shape=(problem_size.m() * problem_size.n(),),
dtype=self.dtype_D
)
)
problem_sizes.append(problem_size)
arguments = GemmGroupedArguments(
operation=self.operation, problem_sizes=problem_sizes,
A=tensor_As, B=tensor_Bs, C=tensor_Cs, D=tensor_Ds,
output_op=self.operation.epilogue_type(alpha, beta)
)
self.operation.run(arguments)
arguments.sync()
#
# Reference check
#
alpha = self.compute_type(alpha).value()
beta = self.compute_type(beta).value()
init_acc = self.accumulator_type(0).value()
for idx, problem_size in enumerate(problem_sizes):
if self.operation.switched:
tensor_ref_A = getTensorRef(
tensor_As[idx], problem_size, "a", transpose(self.operation.B.layout))
tensor_ref_B = getTensorRef(
tensor_Bs[idx], problem_size, "b", transpose(self.operation.A.layout))
tensor_ref_C = getTensorRef(
tensor_Cs[idx], problem_size, "c", transpose(self.operation.C.layout))
tensor_ref_D_ref = getTensorRef(
tensor_D_refs[idx], problem_size, "d", transpose(self.operation.C.layout))
else:
tensor_ref_A = getTensorRef(
tensor_As[idx], problem_size, "a", self.operation.A.layout)
tensor_ref_B = getTensorRef(
tensor_Bs[idx], problem_size, "b", self.operation.B.layout)
tensor_ref_C = getTensorRef(
tensor_Cs[idx], problem_size, "c", self.operation.C.layout)
tensor_ref_D_ref = getTensorRef(
tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)
tensor_view_D_ref = getTensorView(
tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)
cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
tensor_view_D = getTensorView(
tensor_Ds[idx], problem_size, "d", self.operation.C.layout)
passed = cutlass.test.gemm.host.equals(
tensor_view_D, tensor_view_D_ref)
try:
assert passed
except AssertionError:
self.print_problem_size(problem_size)
del arguments
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released after the current run" % get_allocated_size()
return passed
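# Minimal usage sketch (illustrative, not part of the original file): driving the
# grouped-GEMM testbed. `make_grouped_operation()` is a hypothetical stand-in
# for the usual GemmOperationGrouped construction boilerplate.
def _testbed_grouped_example():
    operation = make_grouped_operation()  # hypothetical helper
    testbed = TestbedGrouped(operation, seed=2080)
    # Verify five random problem sizes with non-trivial alpha/beta.
    return testbed.run(problem_count=5, alpha=2.0, beta=1.0)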

View File

@ -1,594 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from time import sleep
import pycutlass
from pycutlass import *
import pycutlass.utils.datatypes as datatypes
import cutlass
from cuda import cudart
from cuda import cuda
from bfloat16 import bfloat16
from .profiler import GpuTimer
import subprocess
def transpose(layout):
if layout == cutlass.RowMajor:
return cutlass.ColumnMajor
elif layout == cutlass.ColumnMajor:
return cutlass.RowMajor
elif layout == cutlass.ColumnMajorInterleaved32:
return cutlass.RowMajorInterleaved32
elif layout == cutlass.RowMajorInterleaved32:
return cutlass.ColumnMajorInterleaved32
def getTensorRef(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: cutlass.layout, batch_offset: int = 0):
ptr = tensor.__array_interface__['data'][0]
if operand == "a":
tensor_coord = problem_size.mk()
batch_stride = problem_size.m() * problem_size.k()
elif operand == "b":
tensor_coord = problem_size.kn()
batch_stride = problem_size.k() * problem_size.n()
elif operand in ["c", "d"]:
tensor_coord = problem_size.mn()
batch_stride = problem_size.m() * problem_size.n()
else:
raise ValueError("Unknown operand: " + operand)
elt_size = DataTypeSizeBytes[datatypes.to_cutlass(tensor.dtype)]
ptr += batch_offset * batch_stride * elt_size
if layout == cutlass.RowMajor:
layout = cutlass.RowMajor.packed(tensor_coord)
layout_tag = "RowMajor"
elif layout == cutlass.ColumnMajor:
layout = cutlass.ColumnMajor.packed(tensor_coord)
layout_tag = "ColumnMajor"
elif layout == cutlass.ColumnMajorInterleaved32:
layout = cutlass.ColumnMajorInterleaved32.packed(tensor_coord)
layout_tag = "ColumnMajorInterleaved32"
elif layout == cutlass.RowMajorInterleaved32:
layout = cutlass.RowMajorInterleaved32.packed(tensor_coord)
layout_tag = "RowMajorInterleaved32"
else:
raise ValueError("unsupported layout")
if tensor.dtype == np.float32:
ref_name = "TensorRefF32" + layout_tag
elif tensor.dtype == np.float64:
ref_name = "TensorRefF64" + layout_tag
elif tensor.dtype == np.float16:
ref_name = "TensorRefF16" + layout_tag
elif tensor.dtype == bfloat16:
ref_name = "TensorRefBF16" + layout_tag
elif tensor.dtype == np.int8:
ref_name = "TensorRefS8" + layout_tag
elif tensor.dtype == np.int32:
ref_name = "TensorRefS32" + layout_tag
else:
raise ValueError("unsupported datatype %s" %
ShortDataTypeNames[tensor.dtype])
return getattr(cutlass, ref_name)(ptr, layout)
def getTensorView(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: str, batch_offset: int = 0):
tensor_ref = getTensorRef(tensor, problem_size, operand, layout, batch_offset)
if operand == "a":
tensor_coord = problem_size.mk()
elif operand == "b":
tensor_coord = problem_size.kn()
elif operand in ["c", "d"]:
tensor_coord = problem_size.mn()
else:
raise ValueError("Unknown operand: " + operand)
if layout == cutlass.RowMajor:
layout_tag = "RowMajor"
elif layout == cutlass.ColumnMajor:
layout_tag = "ColumnMajor"
elif layout == cutlass.ColumnMajorInterleaved32:
layout_tag = "ColumnMajorInterleaved32"
elif layout == cutlass.RowMajorInterleaved32:
layout_tag = "RowMajorInterleaved32"
else:
raise ValueError("unsupported layout")
if tensor.dtype == np.float32:
ref_name = "TensorViewF32" + layout_tag
elif tensor.dtype == np.float64:
ref_name = "TensorViewF64" + layout_tag
elif tensor.dtype == np.float16:
ref_name = "TensorViewF16" + layout_tag
elif tensor.dtype == bfloat16:
ref_name = "TensorViewBF16" + layout_tag
elif tensor.dtype == np.int32:
ref_name = "TensorViewS32" + layout_tag
elif tensor.dtype == np.int8:
ref_name = "TensorViewS8" + layout_tag
else:
raise ValueError("unsupported datatype")
return getattr(cutlass, ref_name)(tensor_ref, tensor_coord)
class GemmUniversalLauncher:
def __init__(self, operation: 'GemmOperationUniversal', seed: int = 2080, interleaved=False,
verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
# create the reduction kernel
self.reduction_operation: ReductionOperation = ReductionOperation(
shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
element_compute=operation.epilogue_functor.element_epilogue, epilogue_functor=operation.epilogue_functor,
count=operation.C.alignment
)
self.math_operation = operation.tile_description.math_instruction.math_operation
#: verify the output result
self.verification = verification
#: profile the kernel's runtime
self.profiling = profiling
self.timer = GpuTimer()
self.warmup_iterations = warmup_iterations
self.iterations = iterations
if "sleep" in kwargs.keys():
self.sleep_time = kwargs["sleep"]
else:
self.sleep_time = 0
#
# Compile the operator
#
op_list = [operation]
if operation.arch < 90:
# Split K via Python is currently only supported for pre-SM90 kernels
op_list.append(self.reduction_operation)
pycutlass.compiler.add_module(op_list)
self.operation = operation
self.dtype_A = GemmUniversalLauncher.numpy_type(operation.A.element)
self.dtype_B = GemmUniversalLauncher.numpy_type(operation.B.element)
self.dtype_C = GemmUniversalLauncher.numpy_type(operation.C.element)
self.dtype_D = GemmUniversalLauncher.numpy_type(operation.C.element)
accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
element_size = DataTypeSize[operation.A.element]
if element_size == 1:
self.scope_max = 1
self.scope_min = 0
elif element_size <= 8:
self.scope_max = 1
self.scope_min = -1
elif element_size == 16:
self.scope_max = 4
self.scope_min = -4
else:
self.scope_max = 8
self.scope_min = -8
#: seed
self.seed: int = seed
#: whether the layout is interleaved
self.interleaved = interleaved
#: compute type
self.compute_type = operation.epilogue_functor.element_epilogue
self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
def print_problem_size(self, p, mode, batch_count):
if mode == cutlass.gemm.Mode.Gemm:
mode = "Gemm"
elif mode == cutlass.gemm.Mode.Batched:
mode = "GemmBatched"
elif mode == cutlass.gemm.Mode.GemmSplitKParallel:
mode = "GemmSplitKParallel"
problem_size = "problem: %d, %d, %d\n batch_count: %d\n mode: %s" % (
p.m(), p.n(), p.k(), batch_count, mode)
print(problem_size)
@staticmethod
def numpy_type(type):
if type == cutlass.float64:
return np.float64
elif type == cutlass.float32:
return np.float32
elif type == cutlass.float16:
return np.float16
elif type == cutlass.bfloat16:
return bfloat16
elif type == cutlass.int32:
return np.int32
elif type == cutlass.int8:
return np.int8
else:
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def uniform_init(self, size, dtype):
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
return np.ceil(
np.random.uniform(
low=self.scope_min - 0.5, high=self.scope_max - 0.5,
size=size).astype(dtype)
)
else:
return np.random.uniform(
low=self.scope_min - 1, high=self.scope_max + 1,
size=size).astype(dtype)
def reorder_tensor_B(self, tensor_B, problem_size):
reordered_tensor_B = np.empty_like(tensor_B)
tensor_ref_B = getTensorRef(
tensor_B, problem_size, "b", self.operation.B.layout)
reordered_tensor_ref_B = getTensorRef(
reordered_tensor_B, problem_size, "b", self.operation.B.layout)
cutlass.gemm.host.reorder_column(
tensor_ref_B, reordered_tensor_ref_B, problem_size)
return reordered_tensor_B
def host_reference(self, problem_size, batch_count, tensor_A, tensor_B, tensor_C, alpha, beta):
tensor_D_ref = np.ones_like(tensor_C)
alpha = self.numpy_type(self.compute_type)(alpha)
beta = self.numpy_type(self.compute_type)(beta)
init_acc = 0
alpha = self.compute_type(alpha).value()
beta = self.compute_type(beta).value()
init_acc = self.accumulator_type(init_acc).value()
for i in range(batch_count):
if self.operation.switched:
tensor_ref_A = getTensorRef(
tensor_A, problem_size, "a", transpose(self.operation.B.layout), batch_offset=i)
tensor_ref_B = getTensorRef(
tensor_B, problem_size, "b", transpose(self.operation.A.layout), batch_offset=i)
tensor_ref_C = getTensorRef(
tensor_C, problem_size, "c", transpose(self.operation.C.layout), batch_offset=i)
tensor_ref_D_ref = getTensorRef(
tensor_D_ref, problem_size, "d", transpose(self.operation.C.layout), batch_offset=i)
else:
tensor_ref_A = getTensorRef(
tensor_A, problem_size, "a", self.operation.A.layout, batch_offset=i)
tensor_ref_B = getTensorRef(
tensor_B, problem_size, "b", self.operation.B.layout, batch_offset=i)
tensor_ref_C = getTensorRef(
tensor_C, problem_size, "c", self.operation.C.layout, batch_offset=i)
tensor_ref_D_ref = getTensorRef(
tensor_D_ref, problem_size, "d", self.operation.C.layout, batch_offset=i)
if self.math_operation in [MathOperation.multiply_add_saturate]:
cutlass.test.gemm.host.gemm_saturate(
problem_size, alpha, tensor_ref_A, tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
else:
cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
return tensor_D_ref
def equal(self, tensor_D, tensor_D_ref, problem_size, batch_count):
for i in range(batch_count):
tensor_view_D = getTensorView(
tensor_D, problem_size, "d", self.operation.C.layout, batch_offset=i)
tensor_view_D_ref = getTensorView(
tensor_D_ref, problem_size, "d", self.operation.C.layout, batch_offset=i)
if not cutlass.test.gemm.host.equals(tensor_view_D, tensor_view_D_ref):
return False
return True
def bytes(self, problem_size, batch_count=1, alpha=1.0, beta=0.0):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
bytes = \
(DataTypeSize[self.operation.A.element] * m // 8) * k + \
(DataTypeSize[self.operation.B.element] * n // 8) * k + \
(DataTypeSize[self.operation.C.element] * m // 8) * n
if beta != 0:
bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
bytes *= batch_count
return bytes
def flops(self, problem_size, batch_count=1):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
flops_ = (m * n * k) * 2 * batch_count
return flops_
def run_cutlass_profiler(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
values = {
"profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
"kernel_name": self.operation.procedural_name(),
"verification_providers": "device",
"provider": "cutlass",
"m": str(problem_size.m()),
"n": str(problem_size.n()),
"k": str(problem_size.k()),
'split_k_slices': str(batch_count),
'alpha': str(alpha),
'beta': str(beta),
'warmup': str(self.warmup_iterations),
'profile': str(self.iterations)
}
cmd_template = \
"${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
" --providers=${provider} --m=${m} --n=${n} --k=${k}"
cmd = SubstituteTemplate(cmd_template, values)
result = subprocess.getoutput(cmd)
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
runtime = float(m.group('runtime'))
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
bytes = int(m.group('bytes'))
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
flops = int(m.group('flops'))
# check if the problem size matches
        assert bytes == self.bytes(problem_size, alpha=alpha, beta=beta)
assert flops == self.flops(problem_size)
return runtime
def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released in the previous run" % get_allocated_size()
np.random.seed(self.seed)
# Assign an actual batch count in cases where we are not running in batched mode.
# This is to differentiate between the number of split K slices and the batch count,
# which are overloaded within the single `batch_count` variable.
true_batch_count = batch_count if mode == cutlass.gemm.Mode.Batched else 1
tensor_A = self.uniform_init(
size=(problem_size.m() * problem_size.k() * true_batch_count,), dtype=self.dtype_A)
tensor_B = self.uniform_init(
size=(problem_size.n() * problem_size.k() * true_batch_count,), dtype=self.dtype_B)
tensor_C = self.uniform_init(
size=(problem_size.m() * problem_size.n() * true_batch_count,), dtype=self.dtype_C)
tensor_D = np.zeros(
shape=(problem_size.m() * problem_size.n() * true_batch_count,), dtype=self.dtype_D)
#
# Launch kernel
#
arguments = GemmArguments(
operation=self.operation, problem_size=problem_size,
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
output_op=self.operation.epilogue_type(alpha, beta),
gemm_mode=mode, split_k_slices=split_k_slices, batch=batch_count
)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
reduction_arguments = ReductionArguments(
self.reduction_operation, problem_size=[
problem_size.m(), problem_size.n()],
partitions=split_k_slices,
workspace=arguments.ptr_D,
destination=tensor_D,
source=tensor_C,
output_op=self.reduction_operation.epilogue_type(alpha, beta)
)
self.operation.run(arguments)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
self.reduction_operation.run(reduction_arguments)
passed = True
if self.verification:
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
reduction_arguments.sync()
else:
arguments.sync()
tensor_D_ref = self.host_reference(
problem_size, true_batch_count, tensor_A, tensor_B, tensor_C, alpha, beta)
passed = self.equal(tensor_D, tensor_D_ref, problem_size, true_batch_count)
try:
assert passed
except AssertionError:
self.print_problem_size(problem_size, mode, batch_count)
if self.profiling:
sleep(self.sleep_time)
for _ in range(self.warmup_iterations):
self.operation.run(arguments)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
self.reduction_operation.run(reduction_arguments)
self.timer.start()
for _ in range(self.iterations):
self.operation.run(arguments)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
self.reduction_operation.run(reduction_arguments)
self.timer.stop_and_wait()
runtime = self.timer.duration(self.iterations)
# free memory and clear buffers
del arguments
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
del reduction_arguments
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released after the current run" % get_allocated_size()
if self.profiling:
return runtime
return passed
def test_all_gemm(operation: 'GemmOperationUniversal', testcase="universal"):
passed = True
minimum_operand_element_size = min(
DataTypeSize[operation.A.element], DataTypeSize[operation.B.element])
opcode_class = operation.tile_description.math_instruction.opcode_class
if opcode_class == cutlass.OpClass.Simt:
alignment = 1
else:
alignment = 128 // minimum_operand_element_size
# int8_t gemm alignment constraints
if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 and operation.A.layout == cutlass.ColumnMajor:
alignment_m = 4
else:
alignment_m = alignment
if opcode_class == cutlass.OpClass.Simt and operation.B.element == cutlass.int8 and operation.A.layout == cutlass.RowMajor:
alignment_n = 4
else:
alignment_n = alignment
if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 \
and operation.B.element == cutlass.int8 \
and (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor):
alignment_k = 4
else:
alignment_k = alignment
threadblock_k = operation.tile_description.threadblock_shape[2]
if testcase == "interleaved":
if operation.A.layout in [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]:
interleavedk = 32
else:
raise ValueError("Unknown layout")
if testcase == "interleaved":
modes = [cutlass.gemm.Mode.Gemm, ]
problem_size_m = [interleavedk, 512+interleavedk]
problem_size_n = [interleavedk, 512+interleavedk]
problem_size_k = [interleavedk, threadblock_k *
operation.tile_description.stages + interleavedk]
problem_alpha = [1.0]
problem_beta = [0.0]
batch_counts = [1, ]
elif testcase == "multistage":
modes = [cutlass.gemm.Mode.Gemm, ]
problem_size_m = [16, 528]
problem_size_n = [16, 528]
problem_size_k = [threadblock_k, threadblock_k * operation.tile_description.stages +
operation.tile_description.math_instruction.instruction_shape[2]]
problem_alpha = [1.0]
problem_beta = [0.0]
batch_counts = [1, ]
else: # universal
modes = [cutlass.gemm.Mode.Gemm]
batch_counts = [1, 2, 3, 5, 7]
if operation.arch < 90:
# Split K kernels via Python are currently only supported pre-SM90
modes.append(cutlass.gemm.Mode.GemmSplitKParallel)
problem_size_m = [alignment_m, 512 - 3 * alignment_m]
problem_size_n = [alignment_n, 512 - 2 * alignment_n]
if operation.tile_description.stages is None:
stages_for_k_calc = 7
else:
stages_for_k_calc = operation.tile_description.stages
problem_size_k = [
alignment_k,
threadblock_k * stages_for_k_calc - alignment_k,
threadblock_k * stages_for_k_calc * 3 - alignment_k]
problem_alpha = [1.0]
problem_beta = [2.0]
testbed = GemmUniversalLauncher(
operation, interleaved=(testcase == "interleaved"))
for mode in modes:
for m in problem_size_m:
for n in problem_size_n:
for k in problem_size_k:
for batch_count in batch_counts:
for alpha in problem_alpha:
for beta in problem_beta:
# skip very small K problems
if testcase == "universal":
if (k // batch_count < 2 * threadblock_k):
continue
problem_size = cutlass.gemm.GemmCoord(m, n, k)
if operation.arch < 90:
split_k_slices = batch_count
else:
split_k_slices = 1
overridden_mode = mode
if mode == cutlass.gemm.Mode.Gemm and batch_count > 1:
overridden_mode = cutlass.gemm.Mode.Batched
passed = testbed.run(
overridden_mode, problem_size, batch_count, split_k_slices, alpha, beta)
err, = cudart.cudaDeviceSynchronize()
                                if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError(
"CUDA Error %s" % str(err))
if not passed:
return False
return passed
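# Minimal usage sketch (illustrative, not part of the original file): running the
# full sweep for one compiled kernel. `make_universal_operation()` is a
# hypothetical stand-in for the GemmOperationUniversal construction code.
def _test_all_gemm_example():
    operation = make_universal_operation()  # hypothetical helper
    # "universal" sweeps plain, batched, and (pre-SM90) parallel split-K modes.
    return test_all_gemm(operation, testcase="universal")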

View File

@ -1,70 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from cuda import cuda
from cuda import cudart
class GpuTimer:
def __init__(self) -> None:
self.events = [
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
]
def start(self, stream=cuda.CUstream(0)):
err, = cuda.cuEventRecord(self.events[0], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
def stop(self, stream=cuda.CUstream(0)):
err, = cuda.cuEventRecord(self.events[1], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
def stop_and_wait(self, stream=cuda.CUstream(0)):
self.stop(stream)
if stream:
err, = cuda.cuStreamSynchronize(stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
else:
err, = cudart.cudaDeviceSynchronize()
            if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("CUDA Error %s" % str(err))
def duration(self, iterations=1):
err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
return duration / float(iterations)
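# Minimal usage sketch (illustrative, not part of the original file): timing a
# region of asynchronous GPU work on the default stream. `launch_kernel()` is a
# hypothetical stand-in for any kernel launch.
def _gpu_timer_example(iterations=100):
    timer = GpuTimer()
    timer.start()
    for _ in range(iterations):
        launch_kernel()  # hypothetical asynchronous GPU work
    timer.stop_and_wait()
    # Average milliseconds per iteration, per cuEventElapsedTime semantics.
    return timer.duration(iterations)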

View File

@ -1,109 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass
from pycutlass import library, SubstituteTemplate
class Layout:
"""
Utility class to map transpose and non-transpose terminology to row- and column-major terminology
"""
T = cutlass.RowMajor
N = cutlass.ColumnMajor
class LayoutCombination:
"""
    Utility class defining all combinations of row- and column-major layouts for operands to a GEMM
"""
NNN = (Layout.N, Layout.N, Layout.N)
NNT = (Layout.N, Layout.N, Layout.T)
NTN = (Layout.N, Layout.T, Layout.N)
NTT = (Layout.N, Layout.T, Layout.T)
TNN = (Layout.T, Layout.N, Layout.N)
TNT = (Layout.T, Layout.N, Layout.T)
TTN = (Layout.T, Layout.T, Layout.N)
TTT = (Layout.T, Layout.T, Layout.T)
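# Illustrative sketch (not part of the original file): each combination unpacks
# into per-operand layouts for A, B, and C.
_layout_A, _layout_B, _layout_C = LayoutCombination.TNT
assert _layout_A == cutlass.RowMajor and _layout_B == cutlass.ColumnMajor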
def get_name(layouts, alignments, element_output,
element_accumulator, element_epilogue, cluster_shape,
threadblock_shape, stages, element_a, element_b, arch, opclass, suffix=""):
"""
Generates a procedural name for a test case.
:param layouts: indexable container of layouts of A, B, and C operands
:param alignments: indexable container of alignments of A, B, and C operands
:param element_output: data type of the output element
:param element_accumulator: data type used in accumulation
:param element_epilogue: data type used in computing the epilogue
:param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
:param threadblock_shape: indexable container of dimensions of threadblock tiles
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param element_a: data type of operand A
:param element_b: data type of operand B
:param arch: compute capability of kernel being generated
:type arch: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param suffix: additional string to add to the suffix of the name
:type suffix: str
:return: str
"""
name_format = 'test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${suffix}'
return SubstituteTemplate(name_format,
{
'arch': str(arch),
'eA': library.DataTypeNames[element_a],
'eB': library.DataTypeNames[element_b],
'eC': library.DataTypeNames[element_output],
'lA': library.ShortLayoutTypeNames[layouts[0]],
'lB': library.ShortLayoutTypeNames[layouts[1]],
'lC': library.ShortLayoutTypeNames[layouts[2]],
'opclass': library.OpcodeClassNames[opclass],
'acc': library.DataTypeNames[element_accumulator],
'cM': str(cluster_shape[0]),
'cN': str(cluster_shape[1]),
'cK': str(cluster_shape[2]),
'tbM': str(threadblock_shape[0]),
'tbN': str(threadblock_shape[1]),
'tbK': str(threadblock_shape[2]),
'stages': str(stages) if stages is not None else 'auto',
'aA' : str(alignments[0]),
'aB' : str(alignments[1]),
'aC' : str(alignments[2]),
'suffix': '' if suffix is None else suffix
}
)
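# Illustrative sketch (not part of the original file): a typical call. With
# these hypothetical arguments, the template above yields a name of roughly the
# form 'test_SM80_Device_Gemm_f16t_f16n_f16t_tensorop_f32_128x128x32_1x1x1_2_align8-8-8'.
def _get_name_example():
    return get_name(
        layouts=LayoutCombination.TNT, alignments=[8, 8, 8],
        element_output=cutlass.float16, element_accumulator=cutlass.float32,
        element_epilogue=cutlass.float32, cluster_shape=[1, 1, 1],
        threadblock_shape=[128, 128, 32], stages=2,
        element_a=cutlass.float16, element_b=cutlass.float16,
        arch=80, opclass=cutlass.OpClass.TensorOp)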

View File

@ -1,39 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typing import Union
from typeguard import typechecked
GemmOperation = 'Union[GemmOperationUniversal, GemmOperationGrouped]'
Tensor = 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]'

View File

@ -1 +0,0 @@
from pycutlass.utils.reference_model import *

View File

@ -1,121 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for converting between frontend datatypes and CUTLASS datatypes
"""
from typing import Union, Tuple
import cutlass
import pycutlass.library as library
try:
import numpy as np
numpy_available = True
except ImportError:
numpy_available = False
def numpy_to_cutlass(inp):
if numpy_available:
if inp == np.float16:
return cutlass.float16
elif inp == np.float32:
return cutlass.float32
elif inp == np.float64:
return cutlass.float64
elif inp == np.int8:
return cutlass.int8
elif inp == np.int32:
return cutlass.int32
return None
try:
import cupy as cp
cupy_available = True
cupy_to_cutlass_dict = {
cp.float16: cutlass.float16,
cp.float32: cutlass.float32,
cp.float64: cutlass.float64
}
except ImportError:
cupy_available = False
def cupy_to_cutlass(inp):
if cupy_available:
if inp == cp.float16:
return cutlass.float16
elif inp == cp.float32:
return cutlass.float32
elif inp == cp.float64:
return cutlass.float64
return None
try:
import torch
torch_available = True
torch_to_cutlass_dict = {
torch.half: cutlass.float16,
torch.float16: cutlass.float16,
torch.float: cutlass.float32,
torch.float32: cutlass.float32,
torch.double: cutlass.float64,
torch.float64: cutlass.float64
}
except ImportError:
torch_available = False
def torch_to_cutlass(inp):
if torch_available:
return torch_to_cutlass_dict.get(inp, None)
try:
import bfloat16
bfloat16_available = True
except ImportError:
bfloat16_available = False
def bfloat16_to_cutlass(inp):
if bfloat16_available:
if inp == bfloat16.bfloat16:
return cutlass.bfloat16
def to_cutlass(inp):
for cvt_fn in [bfloat16_to_cutlass, cupy_to_cutlass, numpy_to_cutlass, torch_to_cutlass]:
out = cvt_fn(inp)
if out is not None:
return out
raise Exception('No available conversion from type {} to a CUTLASS type.'.format(inp))
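# Illustrative sketch (not part of the original file): to_cutlass tries each
# frontend converter in turn, so one entry point covers numpy, cupy, torch, and
# bfloat16 inputs. Assumes numpy is installed.
def _to_cutlass_example():
    assert to_cutlass(np.float32) == cutlass.float32
    assert to_cutlass(np.int8) == cutlass.int8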

View File

@ -1,76 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for interacting with the device
"""
from cuda import cudart
def check_cuda_errors(result: list):
"""
    Checks whether `result` contains a CUDA error and raises it as an exception if so.
    Otherwise, returns the result contained in the remaining fields of `result`.
:param result: the results of the `cudart` method, consisting of an error code and any method results
:type result: list
:return: non-error-code results from the `results` parameter
"""
# `result` is of the format : (cudaError_t, result...)
err = result[0]
if err.value:
raise RuntimeError("CUDA error: {}".format(cudart.cudaGetErrorName(err)))
if len(result) == 1:
return None
elif len(result) == 2:
return result[1]
else:
return result[1:]
def device_cc(device: int = 0) -> int:
"""
Returns the compute capability of the device with ID `device`.
:param device: ID of the device to query
:type device: int
:return: compute capability of the queried device (e.g., 80 for SM80)
:rtype: int
"""
deviceProp = check_cuda_errors(cudart.cudaGetDeviceProperties(device))
major = str(deviceProp.major)
minor = str(deviceProp.minor)
return int(major + minor)
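# Illustrative sketch (not part of the original file): check_cuda_errors unwraps
# the (error, value...) tuples returned by cuda-python, and device_cc composes
# major/minor into, e.g., 80 or 90.
def _device_example():
    cc = device_cc()  # compute capability of device 0
    count = check_cuda_errors(cudart.cudaGetDeviceCount())
    return cc, count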

View File

@ -1,255 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import numpy as np
import cutlass
from pycutlass.library import TensorDescription
from typing import Union
from bfloat16 import bfloat16
try:
import torch
torch_available = True
except ImportError:
torch_available = False
class ReferenceModule:
def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription) -> None:
self.layout_A = A.layout
self.layout_B = B.layout
self.layout_C = C.layout
def run(self, A: np.ndarray, B: np.ndarray, C: np.ndarray, problem_size: cutlass.gemm.GemmCoord, alpha: float=1.0, beta: float=0.0, bias=False, batch=1):
"""
Compute the reference result on CPU
Args:
            A: dense operand with shape (M, K) in row-major and (K, M) in column-major
            B: dense operand with shape (K, N) in row-major and (N, K) in column-major
            C: dense operand with shape (M, N) in row-major and (N, M) in column-major
"""
M, N, K = problem_size.m(), problem_size.n(), problem_size.k()
if isinstance(A, np.ndarray):
if self.layout_A == cutlass.RowMajor:
A_row = np.reshape(A, newshape=(batch, M, K))
else:
A_col = np.reshape(A, newshape=(batch, K, M))
A_row = np.transpose(A_col, axes=(0, 2, 1))
if self.layout_B == cutlass.RowMajor:
B_row = np.reshape(B, newshape=(batch, K, N))
else:
B_col = np.reshape(B, newshape=(batch, N, K))
B_row = np.transpose(B_col, axes=(0, 2, 1))
if self.layout_C == cutlass.RowMajor:
if bias:
C_row = np.reshape(C, newshape=(batch, 1, N))
else:
C_row = np.reshape(C, newshape=(batch, M, N))
else:
if bias:
C_row = np.reshape(C, newshape=(batch, M, 1))
else:
C_col = np.reshape(C, newshape=(batch, N, M))
C_row = np.transpose(C_col, axes=(0, 2, 1))
if A_row.dtype == bfloat16:
# numpy's einsum doesn't support bfloat16
out_row = np.einsum("bik,bkj->bij", A_row.astype(np.float32), B_row.astype(np.float32)) * alpha + C_row * beta
out_row = out_row.astype(C_row.dtype)
else:
out_row = np.einsum("bik,bkj->bij", A_row, B_row) * alpha + C_row * beta
if self.layout_C == cutlass.ColumnMajor:
out = np.transpose(out_row, axes=(0, 2, 1))
else:
out = out_row
return out.ravel()
elif isinstance(A, torch.Tensor):
if self.layout_A == cutlass.RowMajor:
A_row = A.view((M, K))
else:
A_col = A.view((K, M))
A_row = torch.permute(A_col, (1, 0))
if self.layout_B == cutlass.RowMajor:
B_row = B.view((K, N))
else:
B_col = B.view((N, K))
B_row = torch.permute(B_col, (1, 0))
if self.layout_C == cutlass.RowMajor:
C_row = C.view((M, N))
else:
C_col = C.view((N, M))
C_row = torch.permute(C_col, (1, 0))
out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
if self.layout_C == cutlass.ColumnMajor:
out = torch.permute(out_row, (1, 0))
else:
out = out_row
return torch.flatten(out)
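# Minimal usage sketch (illustrative, not part of the original file): checking a
# flattened GEMM result against this reference. The TensorDescription arguments
# and input arrays are hypothetical.
def _reference_module_example(desc_A, desc_B, desc_C, A, B, C, D, M, N, K):
    ref = ReferenceModule(desc_A, desc_B, desc_C)
    D_ref = ref.run(A, B, C, cutlass.gemm.GemmCoord(M, N, K), alpha=1.0, beta=0.0)
    return np.allclose(D.astype(np.float32), D_ref.astype(np.float32))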
#####################################################################################################
# Conv2d
#####################################################################################################
if torch_available:
class Conv2dReferenceModule:
        def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription, kind: cutlass.conv.Operator) -> None:
self.layout_A = A.layout
self.layout_B = B.layout
self.layout_C = C.layout
self.kind = kind
def run(self,
A: Union[np.ndarray, torch.Tensor],
B: Union[np.ndarray, torch.Tensor],
C: Union[np.ndarray, torch.Tensor], problem_size, alpha=1.0, beta=0.0, bias=False) -> np.ndarray:
"""
Compute the reference result on CPU
"""
n = problem_size.N
h = problem_size.H
w = problem_size.W
c = problem_size.C
k = problem_size.K
r = problem_size.R
s = problem_size.S
p = problem_size.P
q = problem_size.Q
stride_h = problem_size.stride_h
stride_w = problem_size.stride_w
pad_h = problem_size.pad_h
pad_w = problem_size.pad_w
dilation_h = problem_size.dilation_h
dilation_w = problem_size.dilation_w
groups = problem_size.groups
if isinstance(A, np.ndarray):
# the pytorch activation layout is NCHW
# weight layout is Cout Cin Kh Kw (also NCHW)
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = np.reshape(A, newshape=(n, h, w, c))
A_torch_nhwc = torch.from_numpy(A_nhwc).to("cuda")
A_torch_nchw = torch.permute(A_torch_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = np.reshape(B, newshape=(k, r, s, c))
B_torch_nhwc = torch.from_numpy(B_nhwc).to("cuda")
B_torch_nchw = torch.permute(B_torch_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
C_nhwc = np.reshape(C, newshape=(n, p, q, k))
C_torch_nhwc = torch.from_numpy(C_nhwc).to("cuda")
C_torch_nchw = torch.permute(C_torch_nhwc, (0, 3, 1, 2))
elif isinstance(A, torch.Tensor):
if self.kind == cutlass.conv.Operator.wgrad:
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = A.view((n, p, q, k))
A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = B.view((n, h, w, c))
B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
if bias:
C_nhwc = C.view((1, 1, 1, c))
else:
C_nhwc = C.view((k, r, s, c))
C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
elif self.kind == cutlass.conv.Operator.dgrad:
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = A.view((n, p, q, k))
A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = B.view((k, r, s, c))
B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
if bias:
C_nhwc = C.view((1, 1, 1, c))
else:
C_nhwc = C.view((n, h, w, c))
C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
else:
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = A.view((n, h, w, c))
A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = B.view((k, r, s, c))
B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
if bias:
C_nhwc = C.view((1, 1, 1, k))
else:
C_nhwc = C.view((n, p, q, k))
C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
if self.kind == cutlass.conv.Operator.fprop:
D_torch_nchw = alpha * torch.nn.functional.conv2d(
A_torch_nchw, B_torch_nchw, stride=(stride_h, stride_w),
padding=(pad_h, pad_w), dilation=(dilation_h, dilation_w), groups=groups) + beta * C_torch_nchw
elif self.kind == cutlass.conv.Operator.dgrad:
D_torch_nchw = alpha * torch.nn.grad.conv2d_input(
(n, c, h, w), B_torch_nchw, A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
).to(torch.float32) + beta * C_torch_nchw
elif self.kind == cutlass.conv.Operator.wgrad:
D_torch_nchw = alpha * torch.nn.grad.conv2d_weight(
B_torch_nchw, (k, c, r, s), A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
).to(torch.float32) + beta * C_torch_nchw
if self.layout_C == cutlass.TensorNHWC:
if isinstance(A, np.ndarray):
D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1)).detach().cpu().numpy()
elif isinstance(A, torch.Tensor):
D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1))
return D_torch_out.flatten()
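# Minimal usage sketch (illustrative, not part of the original file): verifying
# an fprop result with torch as the oracle. The TensorDescription arguments and
# the NHWC problem size are hypothetical.
def _conv2d_reference_example(desc_A, desc_B, desc_C, A, B, C, D, problem_size):
    assert torch_available, "the reference computation uses torch under the hood"
    ref = Conv2dReferenceModule(desc_A, desc_B, desc_C, cutlass.conv.Operator.fprop)
    D_ref = ref.run(A, B, C, problem_size, alpha=1.0, beta=0.0)
    return np.allclose(D, D_ref)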

View File

@ -1,274 +0,0 @@
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
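
A note on the rows above: each line records one functional conv2d test. The first three fields are the operation, the problem geometry (input NHWC extent, output HxW, filter KxRxSxC, padding, stride, dilation, convolution mode, alpha/beta), and an element-type/layout signature (reading "hnhwc_hnhwc_fnhwc_f_f" as f16/NHWC A and B, f32/NHWC C, with f32 accumulator and compute types is an assumption consistent with the test names, not something the diff states). The four trailing integers are presumably 32-bit checksums of the A, B, and C tensors and the reference result D. A minimal parsing sketch under those assumptions:

# Hypothetical parser for one cached conv2d test row; the meaning of the four
# trailing integers (assumed: checksums of A, B, C and the reference D) is an
# assumption, not confirmed by this diff.
from typing import NamedTuple, Tuple

class CachedConv2dRow(NamedTuple):
    op: str                  # "conv2d"
    problem: str             # geometry string, e.g. "dgrad_1x7x8x64_..."
    types: str               # element/layout signature, e.g. "hnhwc_hnhwc_hnhwc_h_h"
    hashes: Tuple[int, ...]  # four 32-bit values

def parse_row(line: str) -> CachedConv2dRow:
    op, problem, types, *rest = line.split()
    return CachedConv2dRow(op, problem, types, tuple(int(x) for x in rest))

row = parse_row(
    "conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 "
    "hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884")
assert row.hashes == (2754803027, 3456572634, 893492926, 1966259884)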

View File

@ -1,233 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass  # needed for pycutlass.get_memory_pool() below
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self.assertTrue(test_all_conv2d(operation, problem_sizes))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
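
These suites are plain unittest modules; besides unittest.main(), a single case can be loaded and run programmatically. A sketch, assuming the file above is importable under the hypothetical module name conv2d_dgrad_f16_sm80:

# Hypothetical driver for one test from the suite above; the module name
# "conv2d_dgrad_f16_sm80" is illustrative only.
import unittest
import pycutlass

pycutlass.get_memory_pool(2**26, 2**26)  # same 2**26-byte pools the file allocates

suite = unittest.defaultTestLoader.loadTestsFromName(
    "conv2d_dgrad_f16_sm80."
    "Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80."
    "test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16")
unittest.TextTestRunner(verbosity=2).run(suite)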

View File

@ -1,209 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=4,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=4,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
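
The four variants above vary only the threadblock K extent (32 vs. 64) and the pipeline depth (3 vs. 4 stages). A rough shared-memory estimate for the pipelined mainloop, counting only the staged f16 A and B tiles and ignoring padding and epilogue storage (an approximation, not CUTLASS's exact allocator arithmetic):

# Approximate staged shared memory for an MxNxK threadblock with f16 A/B:
# each stage buffers an MxK A tile and a KxN B tile (2 bytes per element).
def mainloop_smem_bytes(m, n, k, stages, elem_bytes=2):
    return (m * k + k * n) * elem_bytes * stages

for (m, n, k), stages in [((128, 128, 32), 3), ((128, 128, 32), 4),
                          ((128, 128, 64), 3), ((128, 128, 64), 4)]:
    print(f"{m}x{n}x{k}, {stages} stages: ~{mainloop_smem_bytes(m, n, k, stages) // 1024} KiB")

# Prints ~48/64 KiB for the K=32 tiles and ~96/128 KiB for K=64, all within
# SM80's opt-in limit of 163 KiB of shared memory per threadblock.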

View File

@ -1,130 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[4, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[2, 4, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
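
Note that the two SIMT tests swap warp_count between [4, 2, 1] and [2, 4, 1]: the warp tiling over the 128x128 threadblock changes, but the warp total and hence the launched threads per block do not. A quick identity (not a pycutlass API):

# Threads per threadblock implied by warp_count = [warps_m, warps_n, warps_k].
WARP_SIZE = 32

def threads_per_block(warp_count):
    warps_m, warps_n, warps_k = warp_count
    return warps_m * warps_n * warps_k * WARP_SIZE

assert threads_per_block([4, 2, 1]) == 256
assert threads_per_block([2, 4, 1]) == 256  # same 8 warps, different m/n split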

View File

@ -1,127 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 8],
element_a=cutlass.float32, element_b=cutlass.float32,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float32,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 16], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Unity,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 8],
element_a=cutlass.float32, element_b=cutlass.float32,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float32,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 16], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Unity,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
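
All four deleted test files in this diff assemble operations the same way: a MathInstruction feeds two input TensorDescriptions and a TileDescription, a LinearCombination epilogue is layered on top, and the resulting Conv2dOperation is handed to test_all_conv2d. A minimal sketch of that shared pattern, factored into a hypothetical helper (make_conv2d_operation is not a pycutlass API; it only reuses the names these tests already import):

import pycutlass
from pycutlass import *
from pycutlass.test import *

def make_conv2d_operation(conv_kind, iterator_algorithm, element, alignment,
                          element_accumulator, instruction_shape,
                          threadblock_shape, stages,
                          stride_support=StrideSupport.Strided):
    # Hypothetical helper: builds one Conv2dOperation exactly the way each
    # deleted test in this diff does, so a test body reduces to one call.
    math_inst = MathInstruction(
        instruction_shape=instruction_shape,
        element_a=element, element_b=element,
        element_accumulator=element_accumulator,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )
    A = TensorDescription(element=element, layout=cutlass.TensorNHWC, alignment=alignment)
    B = TensorDescription(element=element, layout=cutlass.TensorNHWC, alignment=alignment)
    # Every file here writes C with alignment 8.
    C = TensorDescription(element=element, layout=cutlass.TensorNHWC, alignment=8)
    tile_description = TileDescription(
        threadblock_shape=threadblock_shape, stages=stages,
        warp_count=[2, 2, 1],
        math_instruction=math_inst
    )
    # In all four files the epilogue scalar type matches the accumulator type.
    epilogue_functor = LinearCombination(
        C.element, C.alignment,
        math_inst.element_accumulator, math_inst.element_accumulator)
    return Conv2dOperation(
        conv_kind=conv_kind, iterator_algorithm=iterator_algorithm,
        arch=80, tile_description=tile_description, A=A, B=B, C=C,
        stride_support=stride_support,
        epilogue_functor=epilogue_functor,
        swizzling_functor=cutlass.IdentitySwizzle1
    )

A test body then reduces to constructing the operation and asserting test_all_conv2d(operation).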

View File

@ -1,195 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
def conv2d_few_channel_problemsizes(channels):
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 8, 8, channels),
cutlass.Tensor4DCoord(16, 3, 3, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 16, 16, channels),
cutlass.Tensor4DCoord(16, 3, 3, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 16, 16, channels),
cutlass.Tensor4DCoord(16, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(32, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
return problem_sizes
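# The positional arguments to cutlass.conv.Conv2dProblemSize above appear to
# mirror the C++ Conv2dProblemSize constructor: input extent (N, H, W, C),
# filter extent (K, R, S, C), padding, stride, dilation, convolution mode,
# and the trailing "1, 1" as split-K slice count and group count. This
# reading is inferred from the values these tests pass, not stated in the diff.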
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16NHWCTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))
def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 8],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=1)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=1)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 32], stages=2,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
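
Each problem-size helper above pairs an input extent with a filter extent, padding, stride, and dilation; the resulting output extent follows the standard closed form for cross-correlation. A small sketch checking that arithmetic against one of the cases above (the formula is textbook-standard, not taken from this diff):

def conv2d_output_extent(input_extent, filter_extent, pad, stride, dilation=1):
    # Standard output-size formula: floor((I + 2p - effective_filter) / s) + 1.
    effective_filter = dilation * (filter_extent - 1) + 1
    return (input_extent + 2 * pad - effective_filter) // stride + 1

# 224x224 input, 7x7 filter, padding 1, stride 2, as in the helper above:
assert conv2d_output_extent(224, 7, pad=1, stride=2) == 110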

View File

@ -1,219 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
def conv2d_fixed_channel_problemsizes(channels):
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 8, 8, channels),
cutlass.Tensor4DCoord(16, 3, 3, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(32, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
return problem_sizes
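# Note the pattern in the fixed-channels tests below: the A/B alignment is
# always set equal to the channel count passed to
# conv2d_fixed_channel_problemsizes (8, 4, then 2), so every access along the
# innermost channel dimension stays fully vectorized. That correspondence is
# observed from these tests themselves, not stated elsewhere in this diff.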
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16NHWCTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(8)))
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(4)))
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(2)))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
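
The alignment values threaded through these tests map directly onto access widths: alignment counts elements, so alignment 8 on half-precision data is a 16-byte (128-bit) vector access, while alignment 2 drops to 32-bit accesses. A quick check of that arithmetic (element sizes are standard, not stated in this diff):

ELEMENT_BYTES = {'f16': 2, 'tf32': 4, 'f32': 4}

def access_bytes(element, alignment):
    # Bytes moved per vectorized access: element size times alignment (in elements).
    return ELEMENT_BYTES[element] * alignment

assert access_bytes('f16', 8) == 16  # alignment-8 f16 tests use 128-bit accesses
assert access_bytes('f16', 2) == 4   # alignment-2 variants fall back to 32-bit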

View File

@ -1,341 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 12),
cutlass.Tensor4DCoord(8, 3, 3, 12),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 14),
cutlass.Tensor4DCoord(8, 3, 3, 14),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 23, 56, 98),
cutlass.Tensor4DCoord(128, 3, 3, 98),
cutlass.Tensor4DCoord(4, 0, 5, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
self.assertTrue(test_all_conv2d(operation, problem_sizes))
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 12),
cutlass.Tensor4DCoord(8, 3, 3, 12),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 14),
cutlass.Tensor4DCoord(8, 3, 3, 14),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 23, 56, 98),
cutlass.Tensor4DCoord(128, 3, 3, 98),
cutlass.Tensor4DCoord(4, 0, 5, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
self.assertTrue(test_all_conv2d(operation, problem_sizes))
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 12),
cutlass.Tensor4DCoord(8, 3, 3, 12),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 28),
cutlass.Tensor4DCoord(8, 3, 3, 28),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 23, 56, 100),
cutlass.Tensor4DCoord(128, 3, 3, 100),
cutlass.Tensor4DCoord(4, 0, 5, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
self.assertTrue(test_all_conv2d(operation, problem_sizes))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
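
Every file in this diff seeds the device memory pool with pycutlass.get_memory_pool(2**26, 2**26) before unittest.main(); 2**26 bytes is 64 MiB, and the two equal arguments are presumably the initial and maximum pool sizes (an assumption from usage here, not documented in this diff):

import pycutlass

POOL_BYTES = 2 ** 26                       # 64 MiB, as used by every test file above
assert POOL_BYTES == 64 * 1024 * 1024
pycutlass.get_memory_pool(POOL_BYTES, POOL_BYTES)  # presumed (initial, maximum) sizes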

Some files were not shown because too many files have changed in this diff