CUTLASS 2.10 (#615)

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
2022-09-03 15:48:46 -07:00
parent ca23ff7924
commit b72cbf957d
289 changed files with 43708 additions and 2513 deletions
--- a/tools/library/scripts/gemm_operation.py
+++ b/tools/library/scripts/gemm_operation.py
@ -149,6 +149,35 @@ class GemmOperation:
    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
    return self.procedural_name()

+
+###################################################################################################
+#
+# Data structure modeling a grouped GEMM operation
+#
+###################################################################################################
+
+#
+class GroupedGemmOperation(GemmOperation):
+  # Forwards the GEMM description to GemmOperation.__init__ and records the group scheduler mode (default: GroupScheduleMode.Device)
+  def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      scheduler_mode = GroupScheduleMode.Device):
+    super().__init__(gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
+                     epilogue_functor, swizzling_functor)
+
+    self.scheduler_mode = scheduler_mode
+
+  # Appends "_schedule<mode>" to the base class's procedural name, where <mode> comes from ShortGroupScheduleModeNames
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    base = super().procedural_name()
+    return SubstituteTemplate(
+      base + "_schedule${schedule}",
+      {
+        'schedule': ShortGroupScheduleModeNames[self.scheduler_mode]
+      })
+
+
 ###################################################################################################
 #
 # Emits single instances of a CUTLASS device-wide operator
@ -738,6 +767,7 @@ using ${operation_name}_base =
    ${epilogue_functor},
    ${swizzling_functor},
    ${stages},
+    ${scheduler_mode},
    ${math_operation}
 >::GemmKernel;

@ -817,6 +847,7 @@ ${compile_guard_end}
      'align_b': str(operation.B.alignment),
      'transform_a': ComplexTransformTag[operation.A.complex_transform],
      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'scheduler_mode': GroupScheduleModeTag[operation.scheduler_mode],
      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation]
    }

--- a/tools/library/scripts/generator.py
+++ b/tools/library/scripts/generator.py
@ -180,7 +180,7 @@ def CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, \
            B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
            C = TensorDescription(element_c, layout[2], alignment_c)

-            new_operation = GemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
+            new_operation = GroupedGemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
              tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

            manifest.append(new_operation)
@ -346,7 +346,7 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
  # iterator algorithm (analytic and optimized)
  #iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
  iterator_algorithms = [IteratorAlgorithm.Optimized]
-  
+
  # by default, only generate the largest tile size, largest alignment, and optimized iterator
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
@ -527,7 +527,7 @@ def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignme
  alignment_c = min(8, alignment)
  
  # iterator algorithm (analytic and optimized)
-  #iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
+#  iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
  iterator_algorithms = [IteratorAlgorithm.Optimized]

  # by default, only generate the largest tile size and optimized iterators
@ -1677,7 +1677,6 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [8, 4, 2]

@ -1694,12 +1693,14 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version):
      TileDescription([128,  64, 32],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256, 128, 64],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256, 64],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 64],  4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([ 64, 256, 64],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256, 128, 64],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 64],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 64],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 64],  4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64],  4, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256,  64, 64],  3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256,  64, 64],  3, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 64],  3, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 64],  5, [2, 2, 1], math_inst, min_cc, max_cc),
@ -1773,23 +1774,22 @@ def GenerateSM80_SparseTensorOp_16832(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [8]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([ 64, 128,  64],  6, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256, 128,  64],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256,  64],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256, 128,  64],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256,  64],  3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128,  64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  64,  64],  3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256,  64],  4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64,  64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64,  64],  4, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([128, 128, 128],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 128],  3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128,  64, 128],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([128, 128, 128],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 128],  3, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([128,  64, 128],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc),
    ]
@ -1917,7 +1917,7 @@ def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80
+  smem_usage = 164

  alignment_constraints = [16,]

@ -1931,10 +1931,10 @@ def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
      TileDescription([128,  64,  64],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128,  64],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64,  64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256, 128, 128],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256, 128],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 128],  4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([ 64, 256, 128],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256, 128, 128],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 128],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 128],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 128],  4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -1986,22 +1986,21 @@ def GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [16,]

  tile_descriptions = [
    TileDescription([128,  64, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([256, 128, 128],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128, 256, 128],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128, 128, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+    TileDescription([256, 128, 128],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 256, 128],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 128, 128],  3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256,  64, 128],  3, [4, 1, 1], math_inst, min_cc, max_cc),
-    TileDescription([ 64, 256, 128],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([ 64, 128, 128],  6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([ 64,  64, 128],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128,  64, 256],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([ 64, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+    TileDescription([ 64, 256, 128],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 128, 128],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64,  64, 128],  4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128,  64, 256],  4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64,  64, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

@ -2102,8 +2101,6 @@ def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80
-
  alignment_constraints = [32,]

  for math_inst in math_instructions:
@ -2116,11 +2113,11 @@ def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):
      TileDescription([128,  64, 128],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256, 128, 256],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256, 256],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 256],  4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([ 64, 256, 256],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 128, 256],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256, 128, 256],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 256],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 256],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 256],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 256],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2173,21 +2170,19 @@ def GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80
-
  alignment_constraints = [32,]

  tile_descriptions = [
    TileDescription([ 64,  64, 256],  4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256,  64, 256],  3, [4, 1, 1], math_inst, min_cc, max_cc),
-    TileDescription([256, 128, 256],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128, 256, 256],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([ 64, 256, 256],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128,  64, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([ 64, 128, 256],  6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128, 128, 512],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-    TileDescription([128,  64, 512],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+    TileDescription([256, 128, 256],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 256, 256],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 128, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 256, 256],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([128,  64, 256],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 128, 256],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 128, 512],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([128,  64, 512],  4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 512],  3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64,  64, 512],  3, [2, 2, 1], math_inst, min_cc, max_cc),
  ]
@ -2338,7 +2333,6 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [4, 2, 1]

@ -2354,11 +2348,11 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version):
      TileDescription([128,  64, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64,  128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2424,7 +2418,6 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [4, 2, 1]

@ -2440,11 +2433,11 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):
      TileDescription([128,  64, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2483,7 +2476,6 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [4, 2, 1]

@ -2497,8 +2489,8 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):
      TileDescription([ 64, 128, 16],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 16],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([256,  64, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([ 64, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([256,  64, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
@ -2583,23 +2575,22 @@ def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [4]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([128, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([128, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  64, 32],  3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32],  6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 32],  6, [2, 2, 1], math_inst, min_cc, max_cc),
-      TileDescription([128, 128, 64],  3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([256,  64, 64],  3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
-      TileDescription([128,  64, 64],  4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited),
+      TileDescription([128, 128, 64],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 64],  3, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([128,  64, 64],  4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
    ]
@ -3047,7 +3038,6 @@ def GenerateSM80_TensorOp_884(manifest, cuda_version):

  min_cc = 80
  max_cc = 1024
-  max_cc_smem_limited = 80

  alignment_constraints = [1,]

--- a/tools/library/scripts/library.py
+++ b/tools/library/scripts/library.py
@ -456,7 +456,7 @@ OperationKindNames = {
 # 
 class Target(enum.Enum):
  library = enum_auto()
-
+#
 ArchitectureNames = {
  50: 'maxwell',
  60: 'pascal',
@ -466,6 +466,16 @@ ArchitectureNames = {
  80: 'ampere',
 }

+# Shared memory budget in KB (per the inline notes) keyed by compute capability; the manifest filter compares it with CalculateSmemUsage()
+SharedMemPerCC = {
+  70: 96,  #  96KB of SMEM
+  72: 96,  #  96KB of SMEM
+  75: 64,  #  64KB of SMEM
+  80: 160, # 164KB of SMEM - 4KB reserved for the driver
+  86: 100, # 100KB of SMEM
+  87: 160, # 164KB of SMEM - 4KB reserved for the driver
+}
+
 ###################################################################################################

 #
@ -564,6 +574,23 @@ SwizzlingFunctorTag = {
  SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
 }

+#
+class GroupScheduleMode(enum.Enum):
+  Device = enum_auto()  # no trailing comma: a comma here would turn the member value into a one-element tuple
+  Host = enum_auto()    # both members now carry plain auto-generated values
+
+#
+GroupScheduleModeTag = {
+  GroupScheduleMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
+  GroupScheduleMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
+}
+
+#
+ShortGroupScheduleModeNames = {
+  GroupScheduleMode.Device: 'Device',
+  GroupScheduleMode.Host: 'Host'
+}
+
 ###################################################################################################

 #
@ -636,7 +663,6 @@ class MathInstruction:
    self.opcode_class = opcode_class
    self.math_operation = math_operation

-
 #
 class TileDescription:

@ -681,3 +707,29 @@ class TriangularTensorDescription:
    self.complex_transform = complex_transform

 ###################################################################################################
+
+# Estimates an operation's shared memory use as (per-stage tile bytes) * stages, returned in whole KB (>> 10 rounds down); presumably DataTypeSize is in bits, hence the // 8 - confirm
+def CalculateSmemUsage(operation):
+  cta_shape = operation.tile_description.threadblock_shape
+  stages = operation.tile_description.stages
+
+  if operation.operation_kind == OperationKind.Gemm and operation.gemm_kind == GemmKind.Sparse:
+    # Elements represented by 8 bits of metadata (based on 4:8, 2:4 or 1:2 sparsity)
+    if DataTypeSize[operation.A.element] == 32:
+      elements_per_8b_md = 2
+    elif DataTypeSize[operation.A.element] == 4:
+      elements_per_8b_md = 8
+    else:
+      elements_per_8b_md = 4
+
+    smem_per_stage = DataTypeSize[operation.A.element] * cta_shape[0] * (cta_shape[2] // 2) // 8 + \
+                     DataTypeSize[operation.B.element] * cta_shape[1] * cta_shape[2] // 8 + \
+                     cta_shape[0] * (cta_shape[2] // 2) // elements_per_8b_md
+  else:
+    # Few BLAS3 operations only have A tensor, so A's element size is used for both terms (cta_shape[0] x cta_shape[2] and cta_shape[1] x cta_shape[2])
+    smem_per_stage = DataTypeSize[operation.A.element] * cta_shape[0] * cta_shape[2] // 8 + \
+                     DataTypeSize[operation.A.element] * cta_shape[1] * cta_shape[2] // 8
+
+  smem_usage = smem_per_stage * stages
+  return (smem_usage >> 10)
+###################################################################################################
--- a/tools/library/scripts/manifest.py
+++ b/tools/library/scripts/manifest.py
@ -276,7 +276,8 @@ class Manifest:

    for cc in self.compute_capabilities:
      if cc >= operation.tile_description.minimum_compute_capability and \
-        cc <= operation.tile_description.maximum_compute_capability:
+         cc <= operation.tile_description.maximum_compute_capability and \
+         (cc not in SharedMemPerCC or SharedMemPerCC[cc] >= CalculateSmemUsage(operation)):

        enabled = True
        break
--- a/tools/library/scripts/pycutlass/README.md
+++ b/tools/library/scripts/pycutlass/README.md
@ -0,0 +1,120 @@
+# PyCUTLASS: CUTLASS Python Interface
+
+PyCUTLASS is a Python interface to the CUTLASS C++ template library. PyCUTLASS takes user-defined operation descriptions, emits C++ code, and compiles it with `nvcc` or `nvrtc`. It also provides wrappers for user-provided arguments from [numpy](https://numpy.org/), [torch](https://pytorch.org/), and [cupy](https://github.com/cupy/cupy) and encodes them into the kernel's parameters.
+
+```python
+import pycutlass
+from pycutlass import *
+import torch
+
+pycutlass.get_memory_pool(2**8, 2**32)
+
+math_inst = MathInstruction(
+    [1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
+    cutlass.OpClass.Simt, MathOperation.multiply_add
+)
+
+tile_description = TileDescription(
+    [128, 128, 8], 4, [2, 4, 1],
+    math_inst, 80, 80
+)
+
+A = TensorDescription(
+    cutlass.float32, cutlass.RowMajor, 1
+)
+
+B = TensorDescription(
+    cutlass.float32, cutlass.RowMajor, 1
+)
+
+C = TensorDescription(
+    cutlass.float32, cutlass.RowMajor, 1
+)
+
+operation = GemmOperationUniversal(
+    arch=80, tile_description=tile_description,
+    A=A, B=B, C=C, element_epilogue=cutlass.float32,
+    epilogue_functor=EpilogueFunctor.LinearCombination, 
+    swizzling_functor=cutlass.IdentitySwizzle1
+)
+
+pycutlass.compiler.add_module([operation,])
+
+problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
+
+tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
+tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
+tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
+tensor_D = torch.empty_like(tensor_C)
+
+
+alpha = 1.0
+beta = 0.0
+
+arguments = GemmArguments(
+    operation=operation, problem_size=problem_size,
+    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
+    output_op=LinearCombinationFunctorArguments(alpha, beta),
+    gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
+)
+
+operation.run(arguments)
+
+arguments.sync()
+
+tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
+
+assert torch.equal(tensor_D, tensor_D_ref)
+```
+PyCUTLASS also provides infrastructure for profiling, compiled artifact management, and a pool memory manager.
+
+## Installation
+
+### Using Docker
+You can run PyCUTLASS in the NGC PyTorch container.
+```shell
+docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:22.08-py3
+```
+PyCUTLASS additionally depends on the Boost C++ library, which can be installed with
+```bash
+apt-get update
+apt-get -y install libboost-all-dev
+```
+
+
+
+### Environment variables
+PyCUTLASS requires two environment variables:
+* `CUTLASS_PATH`: the root directory of CUTLASS
+* `CUDA_INSTALL_PATH`: the directory where cuda toolkit is installed
+
+After setting these two environment variables, PyCUTLASS can be installed with 
+```shell
+cd $CUTLASS_PATH/tools/library/scripts/pycutlass && bash build.sh
+```
+
+## Examples
+Examples can be found in `$CUTLASS_PATH/examples/40_cutlass_py`
+
+## Test
+The test cases are located in `$CUTLASS_PATH/tools/library/scripts/pycutlass/test`. The unit tests can be run with
+```shell
+cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/unit && python test_sm80.py
+```
+
+
+## Troubleshooting
+
+### Issue 1: permission denied
+Building PyCUTLASS requires installing dependencies into your Python environment. Conda could be an option if you don't have permission to do so.
+
+### Issue 2: rmm: module not found
+PyCUTLASS manages the device memory with [RMM](https://github.com/rapidsai/rmm). Our `build.sh` automatically pulls the [rmm branch-22.08](https://github.com/rapidsai/rmm/tree/branch-22.08) from GitHub and builds it from source. The rmm checkout is located at `$CUTLASS_PATH/tools/library/scripts/pycutlass/rmm`. It requires `cmake > 3.20.1`. If the build fails, it can be manually fixed with the following steps:
+```shell
+cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm && ./build.sh librmm rmm
+
+cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm/python
+python setup.py build_ext --inplace
+python setup.py install
+```
+To test whether rmm is successfully installed, try `import rmm`. For other issues related to rmm, please check https://github.com/rapidsai/rmm/issues. 
--- a/tools/library/scripts/pycutlass/build.sh
+++ b/tools/library/scripts/pycutlass/build.sh
@ -0,0 +1,4 @@
+pip install pybind11
+git clone https://github.com/google/googletest.git
+python setup.py install
+python setup.py rmm
--- a/tools/library/scripts/pycutlass/build_doc.sh
+++ b/tools/library/scripts/pycutlass/build_doc.sh
@ -0,0 +1,2 @@
+python setup.py develop
+sphinx-build -b html docs/source/ docs/build/html
--- a/tools/library/scripts/pycutlass/docs/Makefile
+++ b/tools/library/scripts/pycutlass/docs/Makefile
@ -0,0 +1,52 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/tools/library/scripts/pycutlass/docs/make.bat
+++ b/tools/library/scripts/pycutlass/docs/make.bat
@ -0,0 +1,35 @@
+@ECHO OFF
+
+REM Work from the directory containing this script; popd at :end restores it.
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+REM Use 'sphinx-build' unless SPHINXBUILD is already set by the caller.
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+REM No argument given: show the Sphinx help text.
+if "%1" == "" goto help
+
+REM Probe the command with all output discarded; an errorlevel of 9009 or
+REM higher is treated below as "command not found".
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+REM Forward the first argument to Sphinx "make mode" as the builder name.
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/tools/library/scripts/pycutlass/docs/source/conf.py
+++ b/tools/library/scripts/pycutlass/docs/source/conf.py
@ -0,0 +1,93 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'PyCutlass'
+copyright = '2022, Andrew Kerr; Zhaodong Chen; Haicheng Wu; Szymon Migacz; Graham Markall'
+author = 'Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+# 'sphinx.ext.autodoc' backs the ``.. autoclass::`` directives used by the
+# .rst pages; 'enum_tools.autoenum' is a third-party (non 'sphinx.ext.*')
+# extension that backs the ``.. autoenum::`` directive in types.rst.
+extensions = [
+    'sphinx.ext.duration',
+    'sphinx.ext.doctest',
+    'sphinx.ext.autodoc',
+    'sphinx.ext.intersphinx',
+    'enum_tools.autoenum',
+    'sphinx.ext.autosummary'
+]
+
+# NOTE(review): no ``autosummary`` directive is visible in the .rst sources
+# added alongside this file -- confirm these two settings are still needed.
+autosummary_generate = True
+autosummary_imported_members = True
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'classic'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
--- a/tools/library/scripts/pycutlass/docs/source/conv2d_op.rst
+++ b/tools/library/scripts/pycutlass/docs/source/conv2d_op.rst
@ -0,0 +1,13 @@
+CONV2D Operation
+================
+
+.. autoclass:: pycutlass.Conv2dOperation
+    :special-members:
+    :members: run
+    :exclude-members: __weakref__, configuration_name, core_name, extended_name, procedural_name
+
+.. autoclass:: pycutlass.Conv2dArguments
+    :special-members:
+    :members:
+    :exclude-members: initialize
+    :show-inheritance:
--- a/tools/library/scripts/pycutlass/docs/source/cutlass.rst
+++ b/tools/library/scripts/pycutlass/docs/source/cutlass.rst
@ -0,0 +1,2 @@
+cutlass
+=======
--- a/tools/library/scripts/pycutlass/docs/source/descriptor.rst
+++ b/tools/library/scripts/pycutlass/docs/source/descriptor.rst
@ -0,0 +1,6 @@
+Descriptions
+==============
+
+.. autoclass:: pycutlass.TileDescription
+    :special-members: 
+    :members:
--- a/tools/library/scripts/pycutlass/docs/source/frontend.rst
+++ b/tools/library/scripts/pycutlass/docs/source/frontend.rst
@ -0,0 +1,5 @@
+Frontend
+==============
+
+.. autoclass:: pycutlass.NumpyFrontend
+    :members:
--- a/tools/library/scripts/pycutlass/docs/source/gemm_op.rst
+++ b/tools/library/scripts/pycutlass/docs/source/gemm_op.rst
@ -0,0 +1,18 @@
+GEMM Operation
+==============
+
+.. autoclass:: pycutlass.GemmOperationUniversal
+    :special-members: 
+    :members:
+
+.. autoclass:: pycutlass.GemmOperationGrouped
+    :special-members:
+    :members:
+
+.. autoclass:: pycutlass.GemmArguments
+    :special-members:
+    :members:
+
+.. autoclass:: pycutlass.GemmGroupedArguments
+    :special-members:
+    :members:
--- a/tools/library/scripts/pycutlass/docs/source/index.rst
+++ b/tools/library/scripts/pycutlass/docs/source/index.rst
@ -0,0 +1,29 @@
+.. PyCutlass documentation master file, created by
+   sphinx-quickstart on Sun Jun 19 12:05:42 2022.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to PyCutlass's documentation!
+=====================================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
+
+.. toctree::
+   types
+   cutlass
+   descriptor
+   frontend
+   gemm_op
+   conv2d_op
--- a/tools/library/scripts/pycutlass/docs/source/types.rst
+++ b/tools/library/scripts/pycutlass/docs/source/types.rst
@ -0,0 +1,6 @@
+Types
+========
+
+
+.. autoenum:: pycutlass.OperationKind
+    :members:
--- a/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py
+++ b/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py
@ -0,0 +1,104 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+from pycutlass import *
+import pycutlass
+from pycutlass.test.conv2d_testbed import Conv2dLauncher
+
+
+if __name__ == "__main__":
+    pycutlass.get_memory_pool(2**33, 2**33)
+    pycutlass.compiler.nvcc()
+
+    math_inst = MathInstruction(
+        instruction_shape=[16, 8, 16],
+        element_a=cutlass.float16, element_b=cutlass.float16,
+        element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+        math_operation=MathOperation.multiply_add
+    )
+
+    A = TensorDescription(
+        element=math_inst.element_a, 
+        layout=cutlass.TensorNHWC,
+        alignment=8)
+    B = TensorDescription(
+        element=math_inst.element_b, 
+        layout=cutlass.TensorNHWC, 
+        alignment=8)
+    C = TensorDescription(
+        element=cutlass.float32,
+        layout=cutlass.TensorNHWC, 
+        alignment=8)
+
+    tile_description = TileDescription(
+        threadblock_shape=[128, 128, 64], stages=4, 
+        warp_count=[2, 2, 1],
+        math_instruction=math_inst,
+        min_compute=80, max_compute=80
+    )
+
+    operation = Conv2dOperation(
+        conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+        arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+        element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+        epilogue_functor=EpilogueFunctor.LinearCombination,
+        swizzling_functor=cutlass.IdentitySwizzle1
+    )
+
+    profiler = Conv2dLauncher(operation, verification=False, profiling=True)
+
+    python_runtime = profiler.run(
+        problem_size = cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(32, 224, 224, 128),
+            cutlass.Tensor4DCoord(128, 3, 3, 128),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ), split_k_mode=cutlass.conv.SplitKMode.Serial
+    )
+
+
+    cpp_runtime = profiler.run_cutlass_profiler(
+        problem_size = cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(32, 224, 224, 128),
+            cutlass.Tensor4DCoord(128, 3, 3, 128),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ), split_k_mode=cutlass.conv.SplitKMode.Serial
+    )
+
+    print(cpp_runtime / python_runtime)
--- a/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py
@ -0,0 +1,91 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+from pycutlass.test.gemm_testbed import GemmUniversalLauncher
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**32, 2**32)
+    pycutlass.compiler.nvcc()
+
+    math_inst = MathInstruction(
+        instruction_shape=[16, 8, 16],
+        element_a=cutlass.float16, element_b=cutlass.float16,
+        element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+        math_operation=MathOperation.multiply_add
+    )
+
+    tile_description = TileDescription(
+        threadblock_shape=[256, 128, 32],
+        stages=3, warp_count=[4, 2, 1],
+        math_instruction=math_inst, min_compute=80, max_compute=80
+    )
+
+    A = TensorDescription(
+        element=cutlass.float16, layout=cutlass.RowMajor,
+        alignment=4
+    )
+    B = TensorDescription(
+        element=cutlass.float16, layout=cutlass.RowMajor,
+        alignment=4
+    )
+    C = TensorDescription(
+        element=cutlass.float32, layout=cutlass.ColumnMajor,
+        alignment=4
+    )
+
+    element_epilogue = cutlass.float32
+
+    epilogue_functor = EpilogueFunctor.LinearCombination
+    
+    swizzling_functor = cutlass.IdentitySwizzle1
+
+    operation = GemmOperationUniversal(
+        arch=80, tile_description=tile_description,
+        A=A, B=B, C=C, element_epilogue=element_epilogue,
+        epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+    )
+
+    profiler = GemmUniversalLauncher(operation, verification=False, profiling=True)
+    python_runtime = profiler.run(
+        mode=cutlass.gemm.Mode.Gemm, 
+        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096)
+    )
+
+    cpp_runtime = profiler.run_cutlass_profiler(
+        mode=cutlass.gemm.Mode.Gemm,
+        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096),
+    )
+
+    print(cpp_runtime / python_runtime)
--- a/tools/library/scripts/pycutlass/pyproject.toml
+++ b/tools/library/scripts/pycutlass/pyproject.toml
@ -0,0 +1,9 @@
+[build-system]
+
+# Packages installed into the build environment before setup.py is executed.
+# NOTE(review): no "build-backend" key is declared -- confirm that falling back
+# to the default setuptools backend is intended.
+requires = [
+    "setuptools",
+    "scikit-build>0.13.1",
+    "pybind11",
+    # Same numpy bound as setup_requires / install_requires in setup.py.
+    "numpy<1.23",
+    "cmake>=3.20.1,!=3.23.0"
+]
--- a/tools/library/scripts/pycutlass/setup.py
+++ b/tools/library/scripts/pycutlass/setup.py
@ -0,0 +1,79 @@
+import distutils.cmd
+from setuptools import setup
+import setuptools.command.build_py
+import os
+
+# build rmm dependency
+class BuildRMM(distutils.cmd.Command):
+    user_options = []
+    def initialize_options(self):
+        pass
+    def finalize_options(self):
+        pass
+    def run(self):
+        try:
+            import rmm
+        except ImportError:
+            print("installing rmm")
+            os.system("git clone -b branch-22.08 --recurse-submodules https://github.com/rapidsai/rmm.git")
+            os.chdir("./rmm")
+            os.system("./build.sh librmm rmm")
+            os.chdir("./python")
+            os.system("python setup.py build_ext --inplace")
+            os.system("python setup.py install")
+
+cutlass_path = os.getenv('CUTLASS_PATH')
+assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
+cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
+assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
+
+ext_modules = []
+
+try:
+    from pybind11.setup_helpers import Pybind11Extension, build_ext
+    include_dirs = [
+        cutlass_path + "/include",
+        cuda_install_path + "/include",
+        cutlass_path + "/tools/util/include",
+        cutlass_path + "/test",
+        cutlass_path + "/tools/library/scripts/pycutlass/googletest/googletest/include"
+    ]
+
+    ext_modules = [
+        Pybind11Extension("cutlass",
+                          ["src/cpp/cutlass.cpp"],
+                          include_dirs=include_dirs,
+                          extra_compile_args=["-fpermissive"])
+    ]
+except ImportError:
+    pass
+
+setup(
+    name="PyCutlass",
+    version="0.0.1",
+    author="Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall",
+    author_email="zhaodongc@nvidia.com",
+    description="Python interface for CUTLASS",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    package_dir={"": "src"},
+    packages=['pycutlass', 'pycutlass.utils', 'pycutlass.test'],
+    setup_requires=["pybind11", "numpy<1.23"],
+    install_requires=[
+        "numpy<1.23",
+        'pybind11',
+        'cuda-python<11.7.0',
+        'typeguard',
+        'bfloat16',
+        'typing',
+        'scikit-build'
+    ],
+    cmdclass={
+        'rmm': BuildRMM
+    },
+    ext_modules=ext_modules,
+    python_requires=">=3.6",
+)
--- a/tools/library/scripts/pycutlass/src/cpp/compiler.h
+++ b/tools/library/scripts/pycutlass/src/cpp/compiler.h
@ -0,0 +1,75 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+   \brief In-memory compiled artifact cache
+*/
+
+#include <pybind11/pybind11.h>
+#include <string>
+#include <unordered_map>
+
+
+namespace py = pybind11;
+
+namespace cutlass {
+
+/// In-memory cache that maps a kernel key string to the Python object holding
+/// its compiled artifact.
+struct CompileCache {
+public:
+    CompileCache() = default;
+    ~CompileCache() = default;
+
+    using Cache = std::unordered_map<std::string, py::object>;
+
+    /// Look up a compiled kernel.
+    /// Returns the object cached under `kernel`, or Python `None` when no
+    /// entry exists for that key.
+    py::object at(const std::string &kernel) {
+        auto item = cache_.find(kernel);
+
+        if (item != cache_.end()) {
+            return item->second;
+        }
+        return py::none();
+    }
+
+    /// Insert a new compiled kernel for new configuration.
+    /// An entry that already exists under `kernel` is left unchanged, because
+    /// `emplace` does not overwrite existing keys.
+    void insert(const std::string &kernel, const py::object &compiled_kernel){
+        cache_.emplace(kernel, compiled_kernel);
+    }
+
+    /// Number of cached kernels. The value is returned by value, so it carries
+    /// no top-level `const`; the unsigned container size is converted
+    /// explicitly.
+    int64_t size() const { return static_cast<int64_t>(cache_.size()); }
+
+    /// Clear the cache
+    void clear() { cache_.clear(); }
+
+private:
+    Cache cache_;
+};
+
+} // namespace cutlass
--- a/tools/library/scripts/pycutlass/src/cpp/cutlass.cpp
+++ b/tools/library/scripts/pycutlass/src/cpp/cutlass.cpp
@ -0,0 +1,181 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+   \brief binding cutlass C++ APIs to python
+*/
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "builtin_types.h"
+#include "device_launch_parameters.h"
+#include "stddef.h"
+#include "cutlass/cutlass.h"
+
+#include "include/conv/convolution.h"
+#include "include/gemm/gemm.h"
+#include "include/types.h"
+#include "include/layout/layout.h"
+#include "include/tensor_coord.h"
+#include "include/arch.h"
+#include "include/tensor_ref_view.h"
+#include "include/swizzling.h"
+#include "test/conv/convolution.h"
+#include "test/gemm/gemm.h"
+
+
+// Data Types
+#include "library.h"
+
+// compiler
+#include "compiler.h"
+
+
+namespace py = pybind11;
+
+
+// Entry point of the `cutlass` Python extension module. Each bind_* helper
+// (presumably defined in the headers included above) registers its bindings
+// on the module object it is given; `conv`, `gemm` and `test` are created as
+// submodules of `m`.
+PYBIND11_MODULE(cutlass, m) {
+
+    // module doc
+    m.doc() = "cutlass C++ binding";
+
+    //
+    // Bind data type
+    //
+    bind_cutlass_types(m);
+
+    //
+    // Bind layout
+    //
+    bind_layout(m);
+
+    //
+    // Bind tensor coord
+    //
+    bind_tensor_coord(m);
+
+    //
+    // Bind tensor ref
+    //
+    bind_tensor_refs_and_views(m);
+
+    //
+    // Bind opcode
+    //
+    bind_opcode(m);
+
+    //
+    // Bind convolution
+    //
+    py::module_ conv_submodule = m.def_submodule("conv");
+    bind_convolution(conv_submodule);
+
+    //
+    // Bind gemm
+    //
+    py::module_ gemm_submodule = m.def_submodule("gemm");
+    bind_gemm(gemm_submodule);
+
+    //
+    // Bind swizzling
+    //
+    bind_threadblock_swizzle(m);
+
+
+    //
+    // Bind test units
+    //
+    // The test helpers are exposed as cutlass.test.conv and cutlass.test.gemm.
+    py::module_ test = m.def_submodule("test");
+    py::module_ test_conv = test.def_submodule("conv");
+    bind_convolution_test(test_conv);
+
+    py::module_ test_gemm = test.def_submodule("gemm");
+    bind_gemm_test(test_gemm);
+
+    // data types
+    // NOTE(review): this enum has no s8/s32 entries and no real floating-point
+    // entries (f16, bf16, f32, tf32, f64); presumably those are exposed by
+    // bind_cutlass_types() above -- confirm.
+    py::enum_<cutlass::DataType>(m, "dtype")
+        .value("b1", cutlass::DataType::kB1)
+        .value("u2", cutlass::DataType::kU2)
+        .value("u4", cutlass::DataType::kU4)
+        .value("u8", cutlass::DataType::kU8)
+        .value("u16", cutlass::DataType::kU16)
+        .value("u32", cutlass::DataType::kU32)
+        .value("u64", cutlass::DataType::kU64)
+        .value("s2", cutlass::DataType::kS2)
+        .value("s4", cutlass::DataType::kS4)
+        .value("s16", cutlass::DataType::kS16)
+        .value("s64", cutlass::DataType::kS64)
+        .value("cf16", cutlass::DataType::kCF16)
+        .value("cbf16", cutlass::DataType::kCBF16)
+        .value("cf32", cutlass::DataType::kCF32)
+        .value("ctf32", cutlass::DataType::kCTF32)
+        .value("cf64", cutlass::DataType::kCF64)
+        .value("cs2", cutlass::DataType::kCS2)
+        .value("cs4", cutlass::DataType::kCS4)
+        .value("cs8", cutlass::DataType::kCS8)
+        .value("cs16", cutlass::DataType::kCS16)
+        .value("cs32", cutlass::DataType::kCS32)
+        .value("cs64", cutlass::DataType::kCS64)
+        .value("cu2", cutlass::DataType::kCU2)
+        .value("cu4", cutlass::DataType::kCU4)
+        .value("cu8", cutlass::DataType::kCU8)
+        .value("cu16", cutlass::DataType::kCU16)
+        .value("cu32", cutlass::DataType::kCU32)
+        .value("cu64", cutlass::DataType::kCU64)
+        .value("invalid", cutlass::DataType::kInvalid);
+    
+    // layout types
+    // NOTE(review): only some layouts are listed here (no RowMajor,
+    // ColumnMajor or TensorNHWC, for instance); presumably the others are
+    // registered by bind_layout() above -- confirm.
+    py::enum_<cutlass::LayoutType>(m, "layout")
+        .value("ColumnMajorInterleaved2", cutlass::LayoutType::kColumnMajorInterleaved2)
+        .value("RowMajorInterleaved2", cutlass::LayoutType::kRowMajorInterleaved2)
+        .value("ColumnMajorInterleaved64", cutlass::LayoutType::kColumnMajorInterleaved64)
+        .value("RowMajorInterleaved64", cutlass::LayoutType::kRowMajorInterleaved64)
+        .value("TensorNDHWC", cutlass::LayoutType::kTensorNDHWC)
+        .value("TensorNCHW", cutlass::LayoutType::kTensorNCHW)
+        .value("TensorNGHWC", cutlass::LayoutType::kTensorNGHWC)
+        .value("TensorNC64HW64", cutlass::LayoutType::kTensorNC64HW64)
+        .value("TensorC64RSK64", cutlass::LayoutType::kTensorC64RSK64);
+    
+    // transform types
+    py::enum_<cutlass::ComplexTransform>(m, "complex_transform")
+        .value("none", cutlass::ComplexTransform::kNone)
+        .value("conj", cutlass::ComplexTransform::kConjugate);
+
+    //
+    // Compiler
+    //
+    // Expose the in-memory compiled-kernel cache from compiler.h together
+    // with its at / insert / size / clear methods.
+    py::class_<cutlass::CompileCache>(m, "CompileCache")
+        .def(py::init<>())
+        .def("at", &cutlass::CompileCache::at)
+        .def("insert", &cutlass::CompileCache::insert)
+        .def("size", &cutlass::CompileCache::size)
+        .def("clear", &cutlass::CompileCache::clear);
+
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/arch.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/arch.h
@ -0,0 +1,59 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+   \brief Bind opcode classes to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/arch/mma.h"
+
+namespace py = pybind11;
+
+namespace cutlass {
+
+/// Classification of math operators, exposed to Python as `OpClass`.
+enum class OpcodeClass {
+    kSimt,            ///< thread-level operations
+    kTensorOp,        ///< Tensor Core operations
+    kWmmaTensorOp,    ///< WMMA Tensor Core operations
+    kSparseTensorOp   ///< sparse Tensor Core operations
+};
+
+} // namespace cutlass
+
+/// Registers the `OpClass` enum (cutlass::OpcodeClass) on module `m`, with one
+/// documented value per opcode class.
+///
+/// Declared `inline` because the definition lives in a header: this keeps it
+/// safe to include the header from more than one translation unit.
+inline void bind_opcode(py::module &m) {
+    py::enum_<cutlass::OpcodeClass>(m, "OpClass",
+        R"pbdoc(classification of math operators)pbdoc")
+        .value("Simt", cutlass::OpcodeClass::kSimt,
+            R"pbdoc(Tag classifying math operators as thread-level operations)pbdoc")
+        .value("TensorOp", cutlass::OpcodeClass::kTensorOp,
+            R"pbdoc(Tag classifying operators as Tensor Core operations)pbdoc")
+        .value("WmmaTensorOp", cutlass::OpcodeClass::kWmmaTensorOp,
+            R"pbdoc(Tag classifying operators as WMMA Tensor Core operations)pbdoc")
+        .value("SparseTensorOp", cutlass::OpcodeClass::kSparseTensorOp,
+            R"pbdoc(Tag classifying operators as sparse Tensor Core operations)pbdoc");
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/conv/conv_problem_size.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/conv/conv_problem_size.h
@ -0,0 +1,102 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind Convolution problem sizes to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/conv/conv2d_problem_size.h"
+
+namespace py = pybind11;
+
+/// Binds cutlass::conv::Conv2dProblemSize, plus the implicit-GEMM helper
+/// functions that accept it, onto module `m`.
+void bind_conv_problem_size(py::module &m) {
+    // Local shorthands; each names exactly the same type as the fully
+    // qualified spelling.
+    using ProblemSize = cutlass::conv::Conv2dProblemSize;
+    using ConvOperator = cutlass::conv::Operator;
+    using ConvMode = cutlass::conv::Mode;
+    using Coord4D = cutlass::Tensor4DCoord;
+    using Coord2D = cutlass::MatrixCoord;
+
+    //
+    // Conv2d problem size:
+    // include/cutlass/conv/conv2d_problem_size.h
+    //
+    py::class_<ProblemSize> problem_size(m, "Conv2dProblemSize");
+
+    // Constructors: a scalar overload (15 ints, mode, 2 ints) followed by a
+    // coordinate-object overload.
+    problem_size
+        .def(py::init<
+            int, int, int, int, int,
+            int, int, int, int, int,
+            int, int, int, int, int,
+            ConvMode, int, int>())
+        .def(py::init<Coord4D, Coord4D, Coord4D, Coord2D, Coord2D, ConvMode, int, int>());
+
+    // Read/write data members.
+    problem_size
+        .def_readwrite("N", &ProblemSize::N)
+        .def_readwrite("H", &ProblemSize::H)
+        .def_readwrite("W", &ProblemSize::W)
+        .def_readwrite("C", &ProblemSize::C)
+        .def_readwrite("P", &ProblemSize::P)
+        .def_readwrite("Q", &ProblemSize::Q)
+        .def_readwrite("K", &ProblemSize::K)
+        .def_readwrite("R", &ProblemSize::R)
+        .def_readwrite("S", &ProblemSize::S)
+        .def_readwrite("pad_h", &ProblemSize::pad_h)
+        .def_readwrite("pad_w", &ProblemSize::pad_w)
+        .def_readwrite("stride_h", &ProblemSize::stride_h)
+        .def_readwrite("stride_w", &ProblemSize::stride_w)
+        .def_readwrite("dilation_h", &ProblemSize::dilation_h)
+        .def_readwrite("dilation_w", &ProblemSize::dilation_w)
+        .def_readwrite("mode", &ProblemSize::mode)
+        .def_readwrite("split_k_slices", &ProblemSize::split_k_slices)
+        .def_readwrite("groups", &ProblemSize::groups);
+
+    // Member functions.
+    problem_size
+        .def("reset_split_k_slices", &ProblemSize::reset_split_k_slices)
+        .def("activation_extent", &ProblemSize::activation_extent)
+        .def("filter_extent", &ProblemSize::filter_extent)
+        .def("output_extent", &ProblemSize::output_extent)
+        .def("activation_size", &ProblemSize::activation_size)
+        .def("filter_size", &ProblemSize::filter_size)
+        .def("output_size", &ProblemSize::output_size);
+
+    // Free functions: tensor sizes. overload_cast selects the
+    // (Operator, const Conv2dProblemSize &) overload of each helper.
+    m.def("implicit_gemm_tensor_a_size",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_tensor_a_size));
+    m.def("implicit_gemm_tensor_b_size",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_tensor_b_size));
+    m.def("implicit_gemm_tensor_c_size",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_tensor_c_size));
+
+    // Free functions: tensor extents.
+    m.def("implicit_gemm_tensor_a_extent",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_tensor_a_extent));
+    m.def("implicit_gemm_tensor_b_extent",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_tensor_b_extent));
+    m.def("implicit_gemm_tensor_c_extent",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_tensor_c_extent));
+
+    // Free function: implicit GEMM problem size for a convolution.
+    m.def("implicit_gemm_problem_size",
+        py::overload_cast<ConvOperator, const ProblemSize &>(
+            &cutlass::conv::implicit_gemm_problem_size));
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/conv/convolution.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/conv/convolution.h
@ -0,0 +1,91 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind convolution related enum types to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "conv_problem_size.h"
+#include "host.h"
+#include "cutlass/conv/convolution.h"
+
+namespace py = pybind11;
+
+/// Binds the convolution enumerations from cutlass/conv/convolution.h onto
+/// module `m`, then registers the Conv2d problem size bindings and a "host"
+/// submodule holding the host helper functions.
+void bind_convolution(py::module &m) {
+    namespace conv = cutlass::conv;
+
+    //
+    // Enumerated types
+    // cutlass/include/cutlass/conv/convolution.h
+    //
+
+    // Convolutional operator
+    py::enum_<conv::Operator> conv_operator(m, "Operator", R"pbdoc(Convolutional operator)pbdoc");
+    conv_operator.value("fprop", conv::Operator::kFprop, "Forward propagation");
+    conv_operator.value("dgrad", conv::Operator::kDgrad, "Activation grad");
+    conv_operator.value("wgrad", conv::Operator::kWgrad, "Weight grad");
+
+    // Distinguishes convolution from cross correlation
+    py::enum_<conv::Mode> conv_mode(m, "Mode");
+    conv_mode.value("cross_correlation", conv::Mode::kCrossCorrelation);
+    conv_mode.value("convolution", conv::Mode::kConvolution);
+
+    // Selects among several implementation variants trading off performance with simplicity
+    py::enum_<conv::IteratorAlgorithm> iterator_algorithm(m, "IteratorAlgorithm",
+        R"pbdoc(Selects among several implementation variants trading off performance with simplicity)pbdoc");
+    iterator_algorithm.value("analytic", conv::IteratorAlgorithm::kAnalytic,
+        R"pbdoc(functionally correct in all cases but lower performance)pbdoc");
+    iterator_algorithm.value("optimized", conv::IteratorAlgorithm::kOptimized,
+        R"pbdoc(optimized for R <= 32, S <= 32 and unity-stride dgrad)pbdoc");
+    iterator_algorithm.value("fixed_channels", conv::IteratorAlgorithm::kFixedChannels,
+        R"pbdoc(Analytic algorithm optimized for fixed channel count (C == AccessSize))pbdoc");
+    iterator_algorithm.value("few_channels", conv::IteratorAlgorithm::kFewChannels,
+        R"pbdoc(Analytic algorithm optimized for few channels (C divisible by AccessSize))pbdoc");
+
+    // Distinguishes among partial specializations that accelerate certain
+    // problems where convolution stride is unit. The class docstring below is
+    // a two-line raw string; its line break and indentation are part of the text.
+    py::enum_<conv::StrideSupport> stride_support(m, "StrideSupport",
+        R"pbdoc(Distinguishes among partial specializations that accelerate certain problems where convolution
+        stride is unit.)pbdoc");
+    stride_support.value("strided", conv::StrideSupport::kStrided,
+        R"pbdoc(arbitrary convolution stride)pbdoc");
+    stride_support.value("unity", conv::StrideSupport::kUnity,
+        R"pbdoc(unit convolution stride)pbdoc");
+
+    // Identifies split-K mode
+    py::enum_<conv::SplitKMode> split_k_mode(m, "SplitKMode");
+    split_k_mode.value("None", conv::SplitKMode::kNone);
+    split_k_mode.value("Serial", conv::SplitKMode::kSerial);
+    split_k_mode.value("Parallel", conv::SplitKMode::kParallel);
+
+    // Conv problem sizes
+    bind_conv_problem_size(m);
+
+    //
+    // Host helper functions live in their own "host" submodule
+    //
+    py::module_ host_module = m.def_submodule("host");
+    bind_conv_host_helper(host_module);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/conv/host.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/conv/host.h
@ -0,0 +1,54 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind conv host helpers to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/util/host_reorder.h"
+#include "cutlass/layout/tensor.h"
+
+namespace py = pybind11;
+
+
+/// Registers the convolution host helper "reorder_convK" on module `m`.
+void bind_conv_host_helper(py::module &m) {
+    // Destination and source share one reference type: int8_t elements in the
+    // TensorCxRSKx<32> layout.
+    using InterleavedRef = cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>>;
+
+    // Reorder operand B for interleaved layout: derive the implicit GEMM
+    // problem size for the convolution, then forward to cutlass::reorder_convK<32>.
+    auto reorder_operand_b = [](
+        InterleavedRef dest,
+        InterleavedRef src,
+        cutlass::conv::Operator conv_op,
+        const cutlass::conv::Conv2dProblemSize & problem_size) {
+            cutlass::gemm::GemmCoord gemm_shape =
+                cutlass::conv::implicit_gemm_problem_size(conv_op, problem_size);
+            cutlass::reorder_convK<32>(dest, src, gemm_shape);
+        };
+
+    m.def("reorder_convK", reorder_operand_b);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/gemm/gemm.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/gemm/gemm.h
@ -0,0 +1,77 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind gemm related enum types to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/gemm/gemm.h"
+#include "host.h"
+
+namespace py = pybind11;
+
+/// Binds the GEMM mode enumeration and cutlass::gemm::GemmCoord onto module
+/// `m`, then registers the GEMM host helpers in a "host" submodule.
+void bind_gemm(py::module &m) {
+    using cutlass::gemm::GemmCoord;
+    using UniversalMode = cutlass::gemm::GemmUniversalMode;
+
+    //
+    // Enumerated types
+    // cutlass/gemm/gemm.h
+    //
+    py::enum_<UniversalMode> universal_mode(m, "Mode");
+    universal_mode.value("Gemm", UniversalMode::kGemm, "Ordinary GEMM & GEMM Split-K serial");
+    universal_mode.value("GemmSplitKParallel", UniversalMode::kGemmSplitKParallel, "GEMM Split-K parallel");
+    universal_mode.value("Batched", UniversalMode::kBatched, "Batched GEMM");
+    universal_mode.value("Array", UniversalMode::kArray);
+    universal_mode.value("Invalid", UniversalMode::kInvalid);
+
+    // GemmCoord is a structure that specifies a location within the
+    // coordinate space of a GEMM problem
+    py::class_<GemmCoord> gemm_coord(m, "GemmCoord");
+    gemm_coord.def(py::init<int, int, int>());
+
+    // Scalar accessors; overload_cast<>() picks the non-const overload.
+    gemm_coord.def("m", py::overload_cast<>(&GemmCoord::m));
+    gemm_coord.def("n", py::overload_cast<>(&GemmCoord::n));
+    gemm_coord.def("k", py::overload_cast<>(&GemmCoord::k));
+
+    // Two-dimensional projections, each returned as a cutlass::MatrixCoord.
+    gemm_coord.def("mk", [](const GemmCoord & coord) {
+        return cutlass::MatrixCoord(coord.mk());
+    });
+    gemm_coord.def("kn", [](const GemmCoord & coord) {
+        return cutlass::MatrixCoord(coord.kn());
+    });
+    gemm_coord.def("mn", [](const GemmCoord & coord) {
+        return cutlass::MatrixCoord(coord.mn());
+    });
+
+    // Host helper functions live in their own "host" submodule
+    py::module_ host_module = m.def_submodule("host");
+    bind_gemm_host_helper(host_module);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/gemm/host.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/gemm/host.h
@ -0,0 +1,47 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind gemm host helpers to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/util/host_reorder.h"
+#include "cutlass/layout/tensor.h"
+
+namespace py = pybind11;
+
+
+/// Registers two "reorder_column" overloads on module `m`: the int8_t
+/// instantiations of cutlass::reorder_column<32, ...> for the row-major and
+/// column-major 32-interleaved layouts, in that order.
+void bind_gemm_host_helper(py::module &m) {
+    using cutlass::layout::ColumnMajorInterleaved;
+    using cutlass::layout::RowMajorInterleaved;
+
+    m.def("reorder_column",
+        &cutlass::reorder_column<32, int8_t, RowMajorInterleaved<32>>);
+    m.def("reorder_column",
+        &cutlass::reorder_column<32, int8_t, ColumnMajorInterleaved<32>>);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/layout/layout.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/layout/layout.h
@ -0,0 +1,47 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind CUTLASS layouts to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "tensor.h"
+#include "matrix.h"
+
+
+namespace py = pybind11;
+
+/// Registers every layout binding on module `m` by delegating to the tensor
+/// layout binder and then the matrix layout binder.
+void bind_layout(py::module &m) {
+    bind_tensor_layout(m);
+    bind_matrix_layout(m);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/layout/matrix.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/layout/matrix.h
@ -0,0 +1,87 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind Matrix layouts to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/layout/matrix.h"
+
+namespace py = pybind11;
+
+/// Binds the matrix layouts from cutlass/layout/matrix.h onto module `m`.
+/// Each class gets a static "packed" factory and a "stride" method that
+/// returns element 0 of the layout's stride.
+void bind_matrix_layout(py::module &m) {
+    using cutlass::layout::ColumnMajor;
+    using cutlass::layout::ColumnMajorInterleaved;
+    using cutlass::layout::RowMajor;
+    using cutlass::layout::RowMajorInterleaved;
+
+    //
+    // Matrix layouts
+    // cutlass/layout/matrix.h
+    //
+    // The class docstrings are raw strings, so their line breaks and
+    // indentation are part of the text and are kept as-is.
+    //
+
+    // Row-major
+    py::class_<RowMajor> row_major(m, "RowMajor", R"pbdoc(
+        Mapping function for row-major matrices.
+    )pbdoc");
+    row_major.def_static("packed", &RowMajor::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
+    row_major.def("stride",
+        [](const RowMajor & self) { return self.stride().at(0); },
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+
+    // Column-major
+    py::class_<ColumnMajor> column_major(m, "ColumnMajor", R"pbdoc(
+        Mapping function for column-major matrices.
+    )pbdoc");
+    column_major.def_static("packed", &ColumnMajor::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
+    column_major.def("stride",
+        [](const ColumnMajor & self) { return self.stride().at(0); },
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+
+    // Row-major, interleaved by 32
+    py::class_<RowMajorInterleaved<32>> row_major_i32(m, "RowMajorInterleaved32",
+        R"pbdoc(Mapping function for interleaved matrices. Matrix is structured 
+        as row-major arrangement of fixed-size columns 32)pbdoc");
+    row_major_i32.def_static("packed", &RowMajorInterleaved<32>::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
+    row_major_i32.def("stride",
+        [](const RowMajorInterleaved<32> & self) { return self.stride().at(0); },
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+
+    // Column-major, interleaved by 32
+    py::class_<ColumnMajorInterleaved<32>> column_major_i32(m, "ColumnMajorInterleaved32",
+        R"pbdoc(Mapping function for interleaved matrices. Matrix is structured 
+        as column-major arrangement of fixed-size rows 32)pbdoc");
+    column_major_i32.def_static("packed", &ColumnMajorInterleaved<32>::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
+    column_major_i32.def("stride",
+        [](const ColumnMajorInterleaved<32> & self) { return self.stride().at(0); },
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/layout/tensor.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/layout/tensor.h
@ -0,0 +1,74 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind Tensor layouts to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/layout/tensor.h"
+
+namespace py = pybind11;
+
+/// Binds the tensor layouts from cutlass/layout/tensor.h onto module `m`.
+/// Each class gets a static "packed" factory and a "stride" method.
+void bind_tensor_layout(py::module &m) {
+    using NHWC = cutlass::layout::TensorNHWC;
+    using NC32HW32 = cutlass::layout::TensorNCxHWx<32>;
+    using C32RSK32 = cutlass::layout::TensorCxRSKx<32>;
+
+    //
+    // Tensor layouts
+    // cutlass/include/cutlass/layout/tensor.h
+    //
+
+    // Mapping function for 4-D NHWC tensors.
+    py::class_<NHWC> nhwc(m, "TensorNHWC",
+        R"pbdoc(Mapping function for 4-D NHWC tensors)pbdoc");
+    nhwc.def_static("packed", &NHWC::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed NHWC tensor)pbdoc");
+    nhwc.def("stride", py::overload_cast<>(&NHWC::stride),
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+
+    // Mapping function for 4-D NC/xHWx tensors.
+    py::class_<NC32HW32> nc32hw32(m, "TensorNC32HW32",
+        R"pbdoc(Mapping function for 4-D NC/32HW32 tensors)pbdoc");
+    nc32hw32.def_static("packed", &NC32HW32::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
+    nc32hw32.def("stride", py::overload_cast<>(&NC32HW32::stride),
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+
+    // Mapping function for 4-D CxRSKx tensors.
+    py::class_<C32RSK32> c32rsk32(m, "TensorC32RSK32",
+        R"pbdoc(Mapping function for 4-D C32RSK32 tensors)pbdoc");
+    c32rsk32.def_static("packed", &C32RSK32::packed,
+        py::arg("extent"),
+        R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc");
+    c32rsk32.def("stride", py::overload_cast<>(&C32RSK32::stride),
+        R"pbdoc(Returns the stride of the layout)pbdoc");
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/swizzling.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/swizzling.h
@ -0,0 +1,152 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind threadblock swizzling to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/conv/threadblock/threadblock_swizzle.h"
+
+#include <boost/core/demangle.hpp>
+#include <cuda_runtime.h>
+
+namespace py = pybind11;
+
+/// Binds a GEMM identity threadblock-swizzle functor `T` to python under `name`.
+///
+/// The python class gets a default constructor, three `get_tiled_shape` overloads
+/// (GEMM, 2-D convolution and 3-D convolution problem sizes), `get_grid_shape`, and
+/// `tag`, which returns the demangled C++ type name of `T`.
+///
+/// \tparam T    swizzle functor type to expose
+/// \param  m    python module that receives the class
+/// \param  name python-side class name
+template<typename T>
+void bind_identity_swizzle(py::module & m, std::string name) {
+    py::class_<T>(m, name.c_str(),
+        R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc")
+        .def(py::init<>())
+        // Overload 1: GEMM problem size.
+        .def("get_tiled_shape",
+            py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
+                &T::get_tiled_shape, py::const_
+            ), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
+            R"pbdoc(Returns the shape of the problem in units of logical tiles
+            
+            :param problem_size: gemm(M, N, K)
+            :type problem_size: :class:`cutlass.gemm.GemmCoord`
+            )pbdoc")
+        // Overload 2: 2-D convolution problem size.
+        .def("get_tiled_shape",
+            py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
+                &T::get_tiled_shape, py::const_
+            ), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
+            R"pbdoc(Returns the shape of the problem in units of logical tiles
+            
+            :param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
+            :type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
+            )pbdoc")
+        // Overload 3: 3-D convolution problem size.
+        .def("get_tiled_shape",
+            py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv3dProblemSize&, cutlass::gemm::GemmCoord, int>(
+                &T::get_tiled_shape, py::const_
+            ), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
+            R"pbdoc(Returns the shape of the problem in units of logical tiles
+            
+            :param problem_size: Implicit gemm problem size conv_operator(NZPQK, NDHWC, KTRSC)
+            :type problem_size: :class:`cutlass.conv.Conv3dProblemSize`
+            )pbdoc")
+        // TODO: the returned dim3 is not usable in python
+        .def("get_grid_shape", &T::get_grid_shape,
+            py::arg("tiled_shape"),
+            R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
+        // The instance is only needed for method dispatch; the tag depends on T alone.
+        .def("tag", [](const T &){
+            return boost::core::demangle(typeid(T).name());
+        }, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
+}
+
+/// Binds a GEMM-only threadblock-swizzle functor `T` to python under `name`.
+///
+/// The python class gets a default constructor, the GEMM `get_tiled_shape` overload,
+/// `get_grid_shape`, and `tag`, which returns the demangled C++ type name of `T`.
+///
+/// \tparam T    swizzle functor type to expose
+/// \param  m    python module that receives the class
+/// \param  name python-side class name
+/// \param  doc  class-level python docstring
+template<typename T>
+void bind_swizzle(py::module & m, std::string name, std::string doc) {
+    py::class_<T>(m, name.c_str(), doc.c_str())
+        .def(py::init<>())
+        .def("get_tiled_shape",
+            py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
+                &T::get_tiled_shape, py::const_
+            ), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
+            R"pbdoc(Returns the shape of the problem in units of logical tiles
+            
+            :param problem_size: gemm(M, N, K)
+            :type problem_size: :class:`cutlass.gemm.GemmCoord`
+            )pbdoc")
+        .def("get_grid_shape", &T::get_grid_shape,
+            py::arg("tiled_shape"),
+            R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
+        // The instance is only needed for method dispatch; the tag depends on T alone.
+        .def("tag", [](const T &){
+            return boost::core::demangle(typeid(T).name());
+        }, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
+}
+
+/// Binds a strided-dgrad threadblock-swizzle functor `T` to python under `name`.
+///
+/// The python class gets a default constructor, the 2-D convolution
+/// `get_tiled_shape` overload, `get_grid_shape`, and `tag`, which returns the
+/// demangled C++ type name of `T`.
+///
+/// \tparam T    swizzle functor type to expose
+/// \param  m    python module that receives the class
+/// \param  name python-side class name
+template<typename T>
+void bind_dgrad_swizzle(py::module & m, std::string name) {
+    py::class_<T>(m, name.c_str(),
+        R"pbdoc(Threadblock swizzling function for strided dgrad convolution)pbdoc")
+        .def(py::init<>())
+        .def("get_tiled_shape",
+            py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
+                &T::get_tiled_shape, py::const_
+            ), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
+            R"pbdoc(Returns the shape of the problem in units of logical tiles
+            
+            :param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
+            :type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
+            )pbdoc")
+        // The grid is built directly from the tiled shape (m, n, k) -> (x, y, z);
+        // T::get_grid_shape is not called here.
+        .def("get_grid_shape", [](const T &, cutlass::gemm::GemmCoord tiled_shape) {
+            return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
+        }, py::arg("tiled_shape"),
+            R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
+        // The instance is only needed for method dispatch; the tag depends on T alone.
+        .def("tag", [](const T &){
+            return boost::core::demangle(typeid(T).name());
+        }, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
+}
+
+/// Registers CUDA's `dim3` and every threadblock-swizzle class on the python module `m`.
+///
+/// `dim3` is bound first; it is the type returned by the `get_grid_shape` binding in
+/// bind_dgrad_swizzle. The swizzle classes are then instantiated through the
+/// bind_identity_swizzle / bind_swizzle / bind_dgrad_swizzle templates.
+///
+/// Declared `inline` because the definition lives in a header: without it, including
+/// this header from more than one translation unit yields duplicate-symbol link errors.
+///
+/// \param m python module that receives the classes
+inline void bind_threadblock_swizzle(py::module &m) {
+    namespace gemm_tb = cutlass::gemm::threadblock;
+    namespace conv_tb = cutlass::conv::threadblock;
+
+    py::class_<dim3>(m, "dim3",
+        R"pbdoc(A int3 type xyz contains three integers)pbdoc")
+        .def(py::init<int, int, int>(),
+            py::arg("x"), py::arg("y"), py::arg("z"))
+        .def_readwrite("x", &dim3::x, R"pbdoc(get value x)pbdoc")
+        .def_readwrite("y", &dim3::y, R"pbdoc(get value y)pbdoc")
+        .def_readwrite("z", &dim3::z, R"pbdoc(get value z)pbdoc");
+
+    // GEMM identity swizzles; the template argument is reflected in the python name.
+    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<1>>(m, "IdentitySwizzle1");
+    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<2>>(m, "IdentitySwizzle2");
+    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<4>>(m, "IdentitySwizzle4");
+    bind_identity_swizzle<gemm_tb::GemmIdentityThreadblockSwizzle<8>>(m, "IdentitySwizzle8");
+
+    // Swizzles that only expose the GEMM get_tiled_shape overload.
+    bind_swizzle<gemm_tb::GemmHorizontalThreadblockSwizzle>(m, "HorizontalSwizzle",  R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc");
+    bind_swizzle<gemm_tb::GemmBatchedIdentityThreadblockSwizzle>(m, "BatchedIdentitySwizzle",  R"pbdoc(Threadblock swizzling function for batched GEMMs)pbdoc");
+
+    // Strided dgrad convolution swizzles.
+    bind_dgrad_swizzle<conv_tb::StridedDgradIdentityThreadblockSwizzle<1>>(m, "StridedDgradIdentitySwizzle1");
+    bind_dgrad_swizzle<conv_tb::StridedDgradIdentityThreadblockSwizzle<4>>(m, "StridedDgradIdentitySwizzle4");
+    bind_dgrad_swizzle<conv_tb::StridedDgradHorizontalThreadblockSwizzle>(m, "StridedDgradHorizontalSwizzle");
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/tensor_coord.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/tensor_coord.h
@ -0,0 +1,72 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind Tensor Coord to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/tensor_coord.h"
+
+namespace py = pybind11;
+
+/// Registers the CUTLASS coordinate classes on the python module `m`.
+///
+/// Exposed classes:
+///  - "Tensor4DCoord": constructible from (n, h, w, c).
+///  - "Tensor3DCoord" (cutlass::Coord<3>): only `at(dim)` is bound; no constructor is
+///    registered here.
+///  - "MatrixCoord": constructible from (row, column), with `row()` / `column()` accessors.
+///
+/// Declared `inline` because the definition lives in a header: without it, including
+/// this header from more than one translation unit yields duplicate-symbol link errors.
+///
+/// \param m python module that receives the classes
+inline void bind_tensor_coord(py::module &m) {
+    //
+    // Tensor Coords
+    // cutlass/include/cutlass/tensor_coord.h
+    //
+    using Coord3 = cutlass::Coord<3>;
+
+    /// Defines a canonical 4D coordinate used by tensor operations.
+    py::class_<cutlass::Tensor4DCoord>(m, "Tensor4DCoord",
+        R"pbdoc(Defines a canonical 4D coordinate used by tensor operations)pbdoc")
+        .def(py::init<int, int, int, int>(),
+            py::arg("n"), py::arg("h"), py::arg("w"), py::arg("c"),
+            R"pbdoc(Helper to construct from N, H, W, and C)pbdoc");
+
+    // `py::overload_cast<int>` without `py::const_` selects the non-const `at(int)`.
+    py::class_<Coord3>(m, "Tensor3DCoord",
+        R"pbdoc(Defines a canonical 3D coordinate used by tensor operations)pbdoc")
+        .def("at", py::overload_cast<int>(&Coord3::at),
+            py::arg("dim"),
+            R"pbdoc(Gets the index of a given Coord element)pbdoc");
+
+    // Matrix Size
+    py::class_<cutlass::MatrixCoord>(m, "MatrixCoord",
+        R"pbdoc(MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes
+        expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord.)pbdoc")
+        .def(py::init<int, int>(),
+            py::arg("row"), py::arg("column"), R"pbdoc(Helper to construct from a row and column)pbdoc")
+        .def("row", py::overload_cast<>(&cutlass::MatrixCoord::row),
+            R"pbdoc(Returns the row of the coordinate)pbdoc")
+        .def("column", py::overload_cast<>(&cutlass::MatrixCoord::column),
+            R"pbdoc(Returns the column of the coordinate)pbdoc");
+
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/tensor_ref_view.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/tensor_ref_view.h
@ -0,0 +1,102 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind TensorRef and View to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "types.h"
+
+
+/// Binds cutlass::TensorRef<T, L> and cutlass::TensorView<T, L> to python.
+///
+/// Registers:
+///  - class "TensorRef<name>" with a constructor taking (address, layout), plus
+///    `data()` (pointer returned as an integer) and `layout()`;
+///  - a module-level `get_tensor_ref(address, data, layout)` function. The `data`
+///    argument is not read; its type `TF` only takes part in overload selection
+///    when the function is registered for several element types;
+///  - class "TensorView<name>" constructible from (TensorRef, extent).
+///
+/// \tparam T    element type stored behind the pointer
+/// \tparam L    layout type
+/// \tparam TF   python-visible element type used to pick the `get_tensor_ref` overload
+/// \param  m    python module that receives the classes
+/// \param  name suffix appended to "TensorRef" / "TensorView" for the python class names
+template<typename T, typename L, typename TF>
+void bind_tensor_ref_view(py::module &m, std::string name) {
+    using Ref = cutlass::TensorRef<T, L>;
+    using View = cutlass::TensorView<T, L>;
+
+    py::class_<Ref>(m, ("TensorRef" + name).c_str())
+        // Factory-style constructor: replaces the deprecated placement-new
+        // `.def("__init__", ...)` form. `address` is interpreted as a raw T* pointer.
+        .def(py::init([](int64_t address, const L& layout_) {
+            T* ptr = reinterpret_cast<T*>(address);
+            return Ref(ptr, layout_);
+        }))
+        .def("data", [](Ref& tensor_ref) {
+            T* ptr = tensor_ref.data();
+            return reinterpret_cast<int64_t>(ptr);
+        })
+        .def("layout", py::overload_cast<>(&Ref::layout));
+
+    // `data` is intentionally unused: it exists so python can dispatch on element type.
+    m.def("get_tensor_ref", [](int64_t address, TF /*data*/, const L& layout_) {
+        T* ptr = reinterpret_cast<T*>(address);
+        return Ref(ptr, layout_);
+    });
+
+    py::class_<View>(m, ("TensorView" + name).c_str())
+        .def(py::init<const Ref&, const typename L::TensorCoord &>());
+}
+
+
+/// Instantiates bind_tensor_ref_view for every supported (element, layout) pair.
+///
+/// Each call registers a "TensorRef<suffix>" and a "TensorView<suffix>" class and adds
+/// one more `get_tensor_ref` overload to `m`. The third template argument is the
+/// python-visible element type used to select that overload.
+///
+/// Declared `inline` because the definition lives in a header: without it, including
+/// this header from more than one translation unit yields duplicate-symbol link errors.
+///
+/// \param m python module that receives the classes
+inline void bind_tensor_refs_and_views(py::module &m) {
+    using cutlass::layout::RowMajor;
+    using cutlass::layout::ColumnMajor;
+    using cutlass::layout::TensorNHWC;
+
+    /// float
+    bind_tensor_ref_view<float, RowMajor, cutlass::float32>(m, "F32RowMajor");
+    bind_tensor_ref_view<float, ColumnMajor, cutlass::float32>(m, "F32ColumnMajor");
+    bind_tensor_ref_view<float, TensorNHWC, cutlass::float32>(m, "F32NHWC");
+
+    /// double
+    bind_tensor_ref_view<double, RowMajor, cutlass::float64>(m, "F64RowMajor");
+    bind_tensor_ref_view<double, ColumnMajor, cutlass::float64>(m, "F64ColumnMajor");
+    bind_tensor_ref_view<double, TensorNHWC, cutlass::float64>(m, "F64NHWC");
+
+    // half_t
+    bind_tensor_ref_view<cutlass::half_t, RowMajor, cutlass::half_t>(m, "F16RowMajor");
+    bind_tensor_ref_view<cutlass::half_t, ColumnMajor, cutlass::half_t>(m, "F16ColumnMajor");
+    bind_tensor_ref_view<cutlass::half_t, TensorNHWC, cutlass::half_t>(m, "F16NHWC");
+
+    // bfloat16
+    bind_tensor_ref_view<cutlass::bfloat16_t, RowMajor, cutlass::bfloat16_t>(m, "BF16RowMajor");
+    bind_tensor_ref_view<cutlass::bfloat16_t, ColumnMajor, cutlass::bfloat16_t>(m, "BF16ColumnMajor");
+    bind_tensor_ref_view<cutlass::bfloat16_t, TensorNHWC, cutlass::bfloat16_t>(m, "BF16NHWC");
+
+    // int8_t
+    bind_tensor_ref_view<int8_t, cutlass::layout::RowMajorInterleaved<32>, cutlass::int8>(m, "S8RowMajorInterleaved32");
+    bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajorInterleaved<32>, cutlass::int8>(m, "S8ColumnMajorInterleaved32");
+    bind_tensor_ref_view<int8_t, RowMajor, cutlass::int8>(m, "S8RowMajor");
+    bind_tensor_ref_view<int8_t, ColumnMajor, cutlass::int8>(m, "S8ColumnMajor");
+    bind_tensor_ref_view<int8_t, TensorNHWC, cutlass::int8>(m, "S8NHWC");
+    bind_tensor_ref_view<int8_t, cutlass::layout::TensorNCxHWx<32>, cutlass::int8>(m, "S8NC32HW32");
+    bind_tensor_ref_view<int8_t, cutlass::layout::TensorCxRSKx<32>, cutlass::int8>(m, "S8C32RSK32");
+
+    // int32_t
+    bind_tensor_ref_view<int32_t, RowMajor, cutlass::int32>(m, "S32RowMajor");
+    bind_tensor_ref_view<int32_t, ColumnMajor, cutlass::int32>(m, "S32ColumnMajor");
+    bind_tensor_ref_view<int32_t, TensorNHWC, cutlass::int32>(m, "S32NHWC");
+}
--- a/tools/library/scripts/pycutlass/src/cpp/include/types.h
+++ b/tools/library/scripts/pycutlass/src/cpp/include/types.h
@ -0,0 +1,146 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind CUTLASS types to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/half.h"
+
+
+namespace py = pybind11;
+
+namespace cutlass {
+
+// Thin wrappers around builtin arithmetic types. Each one stores a single value in
+// `storage`, can be explicitly constructed from an `int` or a `float`, and returns the
+// plain C value through `c_value()`.
+
+/// 8-bit signed integer wrapper
+struct alignas(1) int8 {
+    int8_t storage;
+    /// Converts `x` to int8_t.
+    explicit int8(int x) : storage(int8_t(x)) {}
+    /// Converts `x` to int8_t.
+    explicit int8(float x) : storage(int8_t(x)) {}
+
+    /// Returns the stored value.
+    int8_t c_value() const { return storage; }
+};
+
+/// 32-bit signed integer wrapper
+struct alignas(4) int32 {
+    int storage;
+    /// Stores `x` unchanged.
+    explicit int32(int x) : storage(x) {}
+    /// Converts `x` to int.
+    explicit int32(float x) : storage(int(x)) {}
+
+    /// Returns the stored value.
+    int c_value() const { return storage; }
+};
+
+/// IEEE single-precision floating-point type
+struct alignas(4) float32 {
+    float storage;
+    /// Stores `x` unchanged.
+    explicit float32(float x) : storage(x) {}
+    /// Converts `x` to float.
+    explicit float32(int x) : storage(float(x)) {}
+
+    /// Returns the stored value.
+    float c_value() const { return storage; }
+};
+
+/// IEEE double-precision floating-point type
+// alignas(8), not 4: an alignment-specifier may not be weaker than the natural
+// alignment of the `double` member.
+struct alignas(8) float64 {
+    double storage;
+    /// Widens `x` to double.
+    explicit float64(float x) : storage(double(x)) {}
+    /// Converts `x` to double.
+    explicit float64(int x) : storage(double(x)) {}
+
+    /// Returns the stored value.
+    double c_value() const { return storage; }
+};
+}
+
+/// Registers the scalar element types on the python module `m`.
+///
+/// Exposed classes: "int8", "int32", "float16", "bfloat16", "float32", "tfloat32" and
+/// "float64". Each has constructors from python numbers, a read/write `storage`
+/// attribute and a `value()` method. For the wrapper structs (int8, int32, float32,
+/// float64) `value()` returns `c_value()`; for half_t, bfloat16_t and tfloat32_t it
+/// returns the object itself.
+///
+/// Declared `inline` because the definition lives in a header: without it, including
+/// this header from more than one translation unit yields duplicate-symbol link errors.
+///
+/// \param m python module that receives the classes
+inline void bind_cutlass_types(py::module &m) {
+
+    // s8
+    py::class_<cutlass::int8>(m, "int8")
+        .def(py::init<float>())
+        .def(py::init<int>())
+        .def_readwrite("storage", &cutlass::int8::storage)
+        .def("value", &cutlass::int8::c_value);
+
+    // s32
+    py::class_<cutlass::int32>(m, "int32")
+        .def(py::init<float>())
+        .def(py::init<int>())
+        .def_readwrite("storage", &cutlass::int32::storage)
+        .def("value", &cutlass::int32::c_value);
+
+    // f16
+    py::class_<cutlass::half_t>(m, "float16")
+        .def(py::init<float>())
+        .def(py::init<double>())
+        .def(py::init<int>())
+        .def(py::init<unsigned>())
+        .def_readwrite("storage", &cutlass::half_t::storage)
+        .def("value", [](const cutlass::half_t& value) {return value;});
+
+    // bf16
+    py::class_<cutlass::bfloat16_t>(m, "bfloat16")
+        .def(py::init<float>())
+        .def(py::init<int>())
+        .def_readwrite("storage", &cutlass::bfloat16_t::storage)
+        .def("value", [](const cutlass::bfloat16_t& value) {return value;});
+
+    // f32
+    py::class_<cutlass::float32>(m, "float32")
+        .def(py::init<float>())
+        .def(py::init<int>())
+        .def_readwrite("storage", &cutlass::float32::storage)
+        .def("value", &cutlass::float32::c_value);
+
+    // tf32
+    py::class_<cutlass::tfloat32_t>(m, "tfloat32")
+        .def(py::init<float>())
+        .def(py::init<int>())
+        .def_readwrite("storage", &cutlass::tfloat32_t::storage)
+        .def("value", [](const cutlass::tfloat32_t& value) {return value;});
+
+    // f64
+    py::class_<cutlass::float64>(m, "float64")
+        .def(py::init<float>())
+        .def(py::init<int>())
+        .def_readwrite("storage", &cutlass::float64::storage)
+        .def("value", &cutlass::float64::c_value);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/library.h
+++ b/tools/library/scripts/pycutlass/src/cpp/library.h
@ -0,0 +1,32 @@
+#include <cutlass/complex.h>
+
+namespace cutlass {
+
+/// Tags identifying element data types.
+/// Enumerator values are implicit (0, 1, 2, ...), so the declaration order below
+/// is significant and must not be changed.
+enum class DataType {
+    kB1,
+    kU2,
+    kU4,
+    kU8,
+    kU16,
+    kU32,
+    kU64,
+    kS2,
+    kS4,
+    kS8,
+    kS16,
+    kS32,
+    kS64,
+    kF16,
+    kBF16,
+    kF32,
+    kTF32,
+    kF64,
+    kCF16,
+    kCBF16,
+    kCF32,
+    kCTF32,
+    kCF64,
+    kCS2,
+    kCS4,
+    kCS8,
+    kCS16,
+    kCS32,
+    kCS64,
+    kCU2,
+    kCU4,
+    kCU8,
+    kCU16,
+    kCU32,
+    kCU64,
+    kInvalid
+};
+
+/// Tags identifying tensor/matrix layouts.
+/// Enumerator values are implicit (0, 1, 2, ...), so the declaration order below
+/// is significant and must not be changed.
+enum class LayoutType {
+    kColumnMajor,
+    kRowMajor,
+    kColumnMajorInterleaved2,
+    kRowMajorInterleaved2,
+    kColumnMajorInterleaved32,
+    kRowMajorInterleaved32,
+    kColumnMajorInterleaved64,
+    kRowMajorInterleaved64,
+    kTensorNHWC,
+    kTensorNDHWC,
+    kTensorNCHW,
+    kTensorNGHWC,
+    kTensorNC32HW32,
+    kTensorNC64HW64,
+    kTensorC32RSK32,
+    kTensorC64RSK64
+};
+
+// NOTE: an enum for the opcode class was announced here but is not defined in this header.
+
+
+} // namespace cutlass
--- a/tools/library/scripts/pycutlass/src/cpp/test/conv/conv_problems.h
+++ b/tools/library/scripts/pycutlass/src/cpp/test/conv/conv_problems.h
@ -0,0 +1,54 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind convolution problems to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+
+#include "unit/conv/device/conv2d_problems.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+namespace py = pybind11;
+
+// Mark the vector type opaque so pybind11 does not apply its automatic
+// list conversion; the class registered via py::bind_vector below is used instead.
+PYBIND11_MAKE_OPAQUE(std::vector<cutlass::conv::Conv2dProblemSize>);
+
+/// Registers the Conv2d test problem-size helpers on the python module `m`.
+///
+/// Exposed classes:
+///  - "Conv2dProblemVector": std::vector<Conv2dProblemSize> bound with py::bind_vector,
+///    plus an explicit `size()` method;
+///  - "TestbedConv2dProblemSizes": constructible from one int, with a read-only
+///    `conv2d_default_sizes` attribute.
+///
+/// Declared `inline` because the definition lives in a header: without it, including
+/// this header from more than one translation unit yields duplicate-symbol link errors.
+///
+/// \param m python module that receives the classes
+inline void bind_conv_problem_size_test(py::module &m) {
+    using ProblemVector = std::vector<cutlass::conv::Conv2dProblemSize>;
+    using Testbed = test::conv::device::TestbedConv2dProblemSizes;
+
+    py::bind_vector<ProblemVector>(m, "Conv2dProblemVector")
+        .def("size", &ProblemVector::size);
+    // Get Conv2d problem sizes
+    py::class_<Testbed>(m, "TestbedConv2dProblemSizes")
+        .def(py::init<int>())
+        .def_readonly("conv2d_default_sizes", &Testbed::conv2d_default_sizes);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/test/conv/convolution.h
+++ b/tools/library/scripts/pycutlass/src/cpp/test/conv/convolution.h
@ -0,0 +1,49 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind convolution related types to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "conv_problems.h"
+#include "host.h"
+
+namespace py = pybind11;
+
+/// Registers all convolution test helpers on the python module `m`.
+///
+/// The problem-size bindings go directly on `m`; the host reference bindings are
+/// placed in a child module named "host".
+///
+/// Declared `inline` because the definition lives in a header: without it, including
+/// this header from more than one translation unit yields duplicate-symbol link errors.
+///
+/// \param m python module that receives the bindings
+inline void bind_convolution_test(py::module &m) {
+    // Conv problem sizes
+    bind_conv_problem_size_test(m);
+
+    // Host-side reference implementations live under `<m>.host`.
+    py::module_ host_submodule = m.def_submodule("host");
+    bind_conv_host_references(host_submodule);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/test/conv/host.h
+++ b/tools/library/scripts/pycutlass/src/cpp/test/conv/host.h
@ -0,0 +1,180 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind Convolution host test helpers to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+#include "unit/conv/device/cache_testbed_output.h"
+
+
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+namespace py = pybind11;
+
+
+/// Registers "conv2d" and "CreateCachedConv2dTestKey" on `m` for one combination
+/// of element types (Ta, Tb, Tc, Tacc, Te) and layouts (La, Lb, Lc).
+///
+/// Note the argument order: Conv2d receives Te before Tacc, whereas
+/// CreateCachedConv2dTestKey receives Tacc before Te.
+template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
+void bind_conv2d_host(py::module &m) {
+    auto reference_conv2d =
+        &cutlass::reference::host::Conv2d<Ta, La, Tb, Lb, Tc, Lc, Te, Tacc>;
+    auto make_cache_key =
+        &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>;
+
+    m.def("conv2d", reference_conv2d);
+    m.def("CreateCachedConv2dTestKey", make_cache_key);
+}
+
+/// Same registrations as bind_conv2d_host, except that Conv2d is instantiated
+/// with cutlass::NumericConverterClamp<Tc, Te> as its ninth template argument.
+///
+/// Conv2d receives Te before Tacc, whereas CreateCachedConv2dTestKey receives
+/// Tacc before Te.
+template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
+void bind_conv2d_host_sat(py::module &m) {
+    using ClampConverter = cutlass::NumericConverterClamp<Tc, Te>;
+
+    auto reference_conv2d =
+        &cutlass::reference::host::Conv2d<Ta, La, Tb, Lb, Tc, Lc, Te, Tacc, ClampConverter>;
+    auto make_cache_key =
+        &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>;
+
+    m.def("conv2d", reference_conv2d);
+    m.def("CreateCachedConv2dTestKey", make_cache_key);
+}
+
+/// Forwards to bind_conv2d_host with cutlass::layout::TensorNHWC used as the
+/// layout of all three tensors.
+template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
+void bind_conv2d_host_nhwc(py::module &m) {
+    using Layout = cutlass::layout::TensorNHWC;
+
+    bind_conv2d_host<Ta, Layout, Tb, Layout, Tc, Layout, Tacc, Te>(m);
+}
+
+/// Forwards to bind_conv2d_host_sat with the 32-interleaved layouts:
+/// TensorNCxHWx<32> for the first and third tensor, TensorCxRSKx<32> for the second.
+template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
+void bind_conv2d_host_nc32hw32(py::module &m) {
+    using LayoutAC = cutlass::layout::TensorNCxHWx<32>;
+    using LayoutB = cutlass::layout::TensorCxRSKx<32>;
+
+    bind_conv2d_host_sat<Ta, LayoutAC, Tb, LayoutB, Tc, LayoutAC, Tacc, Te>(m);
+}
+
+
+/// Registers "equals" on `m`, bound to the TensorEquals<T, Layout> overload that
+/// takes two `const cutlass::TensorView<T, Layout>&` arguments.
+template<typename T, typename Layout>
+void bind_tensor_equals(py::module &m) {
+    using View = cutlass::TensorView<T, Layout>;
+
+    m.def("equals",
+        py::overload_cast<const View &, const View &>(
+            &cutlass::reference::host::TensorEquals<T, Layout>));
+}
+
+// Registers test::conv::device::TensorHash<Element, Layout> as "TensorHash" on a
+// py::module named `m`, which must be in scope at the point of expansion.
+// Keyword arguments: "view", "hash" (defaults to a default-constructed CRC32)
+// and "crc" (defaults to uint32_t()).
+#define BIND_TENSOR_HASH(Element, Layout) { \
+    m.def( \
+        "TensorHash", \
+        &test::conv::device::TensorHash<Element, Layout>, \
+        py::arg("view"), \
+        py::arg("hash") = test::conv::device::CRC32(), \
+        py::arg("crc") = uint32_t()); \
+}
+
+/// Registers the host-side convolution reference and test-cache bindings on `m`.
+///
+/// Registered names: "conv2d" and "CreateCachedConv2dTestKey" (one registration
+/// per type combination listed below), "equals", the classes "CachedTestKey",
+/// "CachedTestResult", "CachedTestResultListing" and "CRC32", and "TensorHash".
+///
+/// The relative order of the registrations is preserved because repeated
+/// definitions under one name accumulate as overloads. Each type combination
+/// is registered exactly once; the list previously repeated four of the s8
+/// combinations verbatim, which only added redundant overloads.
+void bind_conv_host_references(py::module &m) {
+    using half_t = cutlass::half_t;
+    using bfloat16_t = cutlass::bfloat16_t;
+    using NHWC = cutlass::layout::TensorNHWC;
+    using NC32HW32 = cutlass::layout::TensorNCxHWx<32>;
+    namespace testbed = test::conv::device;
+
+    //
+    // Conv2d reference on host
+    // tools/util/include/cutlass/util/reference/host/convolution.h
+    //
+    // Template arguments are <Ta, Tb, Tc, Tacc, Te>.
+
+    /// double
+    bind_conv2d_host_nhwc<double, double, double, double, double>(m);
+    /// float
+    bind_conv2d_host_nhwc<float, float, float, float, float>(m);
+    /// half
+    bind_conv2d_host_nhwc<half_t, half_t, half_t, half_t, half_t>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, half_t, float, half_t>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, half_t, float, float>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, half_t, half_t, float>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, float, half_t, half_t>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, float, float, half_t>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, float, float, float>(m);
+    bind_conv2d_host_nhwc<half_t, half_t, float, half_t, float>(m);
+    /// bfloat16
+    bind_conv2d_host_nhwc<bfloat16_t, bfloat16_t, bfloat16_t, float, bfloat16_t>(m);
+    bind_conv2d_host_nhwc<bfloat16_t, bfloat16_t, bfloat16_t, float, float>(m);
+    bind_conv2d_host_nhwc<bfloat16_t, bfloat16_t, float, float, bfloat16_t>(m);
+    bind_conv2d_host_nhwc<bfloat16_t, bfloat16_t, float, float, float>(m);
+    /// s8
+    bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
+    bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
+    bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
+    bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
+    bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, float>(m);
+    bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, float>(m);
+
+    /// s8, 32-interleaved layouts
+    bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
+    bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
+    bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
+    bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
+    bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, float>(m);
+    bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, float>(m);
+
+    //
+    // Compare whether two tensors are equal
+    //
+    /// double
+    bind_tensor_equals<double, NHWC>(m);
+    /// float
+    bind_tensor_equals<float, NHWC>(m);
+    /// half
+    bind_tensor_equals<half_t, NHWC>(m);
+    /// bfloat16
+    bind_tensor_equals<bfloat16_t, NHWC>(m);
+    /// s32
+    bind_tensor_equals<int32_t, NHWC>(m);
+    bind_tensor_equals<int32_t, NC32HW32>(m);
+    /// s8
+    bind_tensor_equals<int8_t, NHWC>(m);
+    bind_tensor_equals<int8_t, NC32HW32>(m);
+
+    //
+    // Cached test results
+    //
+    py::class_<testbed::CachedTestKey>(m, "CachedTestKey")
+        .def(py::init<>())
+        .def(py::init<std::string, std::string, std::string, uint32_t, uint32_t, uint32_t>());
+
+    py::class_<testbed::CachedTestResult>(m, "CachedTestResult")
+        .def(py::init<>())
+        .def(py::init<uint32_t>())
+        .def_readwrite("D", &testbed::CachedTestResult::D);
+
+    py::class_<testbed::CachedTestResultListing>(m, "CachedTestResultListing")
+        .def(py::init<const std::string &>())
+        .def("find", &testbed::CachedTestResultListing::find)
+        .def("append", &testbed::CachedTestResultListing::append)
+        .def("write", &testbed::CachedTestResultListing::write);
+
+    py::class_<testbed::CRC32>(m, "CRC32")
+        .def(py::init<>());
+
+    //
+    // Tensor hashing
+    //
+    BIND_TENSOR_HASH(double, NHWC);
+    BIND_TENSOR_HASH(float, NHWC);
+    BIND_TENSOR_HASH(half_t, NHWC);
+    BIND_TENSOR_HASH(bfloat16_t, NHWC);
+    BIND_TENSOR_HASH(int32_t, NHWC);
+    BIND_TENSOR_HASH(int8_t, NC32HW32);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/test/gemm/gemm.h
+++ b/tools/library/scripts/pycutlass/src/cpp/test/gemm/gemm.h
@ -0,0 +1,45 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind gemm test to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "host.h"
+
+namespace py = pybind11;
+
+/// Registers the GEMM test bindings: creates a submodule of `m` named "host"
+/// and adds the host reference bindings to it.
+void bind_gemm_test(py::module &m) {
+    auto host = m.def_submodule("host");
+    bind_gemm_host_reference(host);
+}
--- a/tools/library/scripts/pycutlass/src/cpp/test/gemm/host.h
+++ b/tools/library/scripts/pycutlass/src/cpp/test/gemm/host.h
@ -0,0 +1,431 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Bind gemm test host functions to python
+*/
+#pragma once
+#include <pybind11/pybind11.h>
+#include <pybind11/stl_bind.h>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/host_reorder.h"
+
+#include "cutlass/functional.h"
+
+namespace py = pybind11;
+
+
+/// Registers "gemm_saturate" on `m` for one combination of element types and layouts.
+///
+/// The bound function is the compute_gemm overload whose parameters are
+/// (GemmCoord, ComputeType, TensorRef A, TensorRef B, ComputeType,
+///  TensorRef C, TensorRef C, AccumulatorType), instantiated with
+/// cutlass::NumericConverterClamp<ElementC, AccumulatorType> as the converter.
+template<
+    typename ElementA, typename LayoutA,
+    typename ElementB, typename LayoutB,
+    typename ElementC, typename LayoutC,
+    typename AccumulatorType, typename ComputeType,
+    typename InnerProductOp>
+void bind_host_gemm_saturate(py::module &m) {
+    using RefA = cutlass::TensorRef<ElementA, LayoutA>;
+    using RefB = cutlass::TensorRef<ElementB, LayoutB>;
+    using RefC = cutlass::TensorRef<ElementC, LayoutC>;
+    using Converter = cutlass::NumericConverterClamp<ElementC, AccumulatorType>;
+
+    m.def("gemm_saturate",
+        py::overload_cast<
+            cutlass::gemm::GemmCoord, ComputeType,
+            RefA, RefB,
+            ComputeType,
+            RefC, RefC,
+            AccumulatorType>(
+                &cutlass::reference::host::compute_gemm<
+                    ElementA, LayoutA,
+                    ElementB, LayoutB,
+                    ElementC, LayoutC,
+                    ComputeType, AccumulatorType,
+                    InnerProductOp, Converter>));
+}
+
+/// Registers "gemm" on `m` for one combination of element types and layouts.
+///
+/// The bound function is the compute_gemm overload whose parameters are
+/// (GemmCoord, ComputeType, TensorRef A, TensorRef B, ComputeType,
+///  TensorRef C, TensorRef C, AccumulatorType), instantiated with
+/// cutlass::NumericConverter<ElementC, AccumulatorType> as the converter.
+template<
+    typename ElementA, typename LayoutA,
+    typename ElementB, typename LayoutB,
+    typename ElementC, typename LayoutC,
+    typename AccumulatorType, typename ComputeType,
+    typename InnerProductOp>
+void bind_host_gemm(py::module &m) {
+    using RefA = cutlass::TensorRef<ElementA, LayoutA>;
+    using RefB = cutlass::TensorRef<ElementB, LayoutB>;
+    using RefC = cutlass::TensorRef<ElementC, LayoutC>;
+    using Converter = cutlass::NumericConverter<ElementC, AccumulatorType>;
+
+    m.def("gemm",
+        py::overload_cast<
+            cutlass::gemm::GemmCoord, ComputeType,
+            RefA, RefB,
+            ComputeType,
+            RefC, RefC,
+            AccumulatorType>(
+                &cutlass::reference::host::compute_gemm<
+                    ElementA, LayoutA,
+                    ElementB, LayoutB,
+                    ElementC, LayoutC,
+                    ComputeType, AccumulatorType,
+                    InnerProductOp, Converter>));
+}
+
+
+/// Registers "gemm" (via bind_host_gemm) for all eight combinations of
+/// RowMajor / ColumnMajor layouts of A, B and C, using
+/// cutlass::multiply_add<AccumulatorType> as the inner-product operator.
+///
+/// Every instantiation passes AccumulatorType before ComputeType, which is the
+/// order of bind_host_gemm's template parameters. The all-row-major case used to
+/// pass them swapped, so its binding disagreed with the other seven whenever
+/// the two types differed.
+template<
+    typename ElementA, typename ElementB, typename ElementC,
+    typename AccumulatorType, typename ComputeType>
+void bind_host_gemm_multiply_add(py::module &m) {
+    using Row = cutlass::layout::RowMajor;
+    using Col = cutlass::layout::ColumnMajor;
+    using Op = cutlass::multiply_add<AccumulatorType>;
+
+    // Layout order per line: A, B, C. Registration order is unchanged.
+    bind_host_gemm<ElementA, Row, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Row, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Row, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Row, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+}
+
+/// Registers "gemm_saturate" (via bind_host_gemm_saturate) for all eight
+/// combinations of RowMajor / ColumnMajor layouts of A, B and C, using
+/// cutlass::multiply_add<AccumulatorType> as the inner-product operator.
+///
+/// Every instantiation passes AccumulatorType before ComputeType, which is the
+/// order of bind_host_gemm_saturate's template parameters. The all-row-major
+/// case used to pass them swapped, so its binding disagreed with the other
+/// seven whenever the two types differed.
+template<
+    typename ElementA, typename ElementB, typename ElementC,
+    typename AccumulatorType, typename ComputeType>
+void bind_host_gemm_multiply_add_saturate(py::module &m) {
+    using Row = cutlass::layout::RowMajor;
+    using Col = cutlass::layout::ColumnMajor;
+    using Op = cutlass::multiply_add<AccumulatorType>;
+
+    // Layout order per line: A, B, C. Registration order is unchanged.
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+}
+
+
+/// Registers "gemm" (via bind_host_gemm) for all eight combinations of
+/// RowMajorInterleaved<32> / ColumnMajorInterleaved<32> layouts of A, B and C,
+/// using cutlass::multiply_add<AccumulatorType> as the inner-product operator.
+///
+/// Every instantiation passes AccumulatorType before ComputeType, which is the
+/// order of bind_host_gemm's template parameters. The all-row-major case used to
+/// pass them swapped, so its binding disagreed with the other seven whenever
+/// the two types differed.
+template<
+    typename ElementA, typename ElementB, typename ElementC,
+    typename AccumulatorType, typename ComputeType>
+void bind_host_gemm_multiply_add_interleaved(py::module &m) {
+    using Row = cutlass::layout::RowMajorInterleaved<32>;
+    using Col = cutlass::layout::ColumnMajorInterleaved<32>;
+    using Op = cutlass::multiply_add<AccumulatorType>;
+
+    // Layout order per line: A, B, C. Registration order is unchanged.
+    bind_host_gemm<ElementA, Row, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Row, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Row, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Row, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm<ElementA, Col, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+}
+
+/// Registers "gemm_saturate" (via bind_host_gemm_saturate) for all eight
+/// combinations of RowMajorInterleaved<32> / ColumnMajorInterleaved<32> layouts
+/// of A, B and C, using cutlass::multiply_add<AccumulatorType> as the
+/// inner-product operator.
+///
+/// Every instantiation passes AccumulatorType before ComputeType, which is the
+/// order of bind_host_gemm_saturate's template parameters. The all-row-major
+/// case used to pass them swapped, so its binding disagreed with the other
+/// seven whenever the two types differed.
+template<
+    typename ElementA, typename ElementB, typename ElementC,
+    typename AccumulatorType, typename ComputeType>
+void bind_host_gemm_multiply_add_saturate_interleaved(py::module &m) {
+    using Row = cutlass::layout::RowMajorInterleaved<32>;
+    using Col = cutlass::layout::ColumnMajorInterleaved<32>;
+    using Op = cutlass::multiply_add<AccumulatorType>;
+
+    // Layout order per line: A, B, C. Registration order is unchanged.
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Row, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Row, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Row, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Col, ElementC, Row, AccumulatorType, ComputeType, Op>(m);
+    bind_host_gemm_saturate<ElementA, Col, ElementB, Col, ElementC, Col, AccumulatorType, ComputeType, Op>(m);
+}
+
+// Registers "equals" on a py::module named `m` (which must be in scope at the
+// point of expansion), bound to the TensorEquals<Element, Layout> overload that
+// takes two `const cutlass::TensorView<Element, Layout>&` arguments.
+#define BIND_TENSOR_EQUAL(Element, Layout) { \
+    using TensorViewType = cutlass::TensorView<Element, Layout>; \
+    m.def( \
+        "equals", \
+        py::overload_cast<const TensorViewType &, const TensorViewType &>( \
+            &cutlass::reference::host::TensorEquals<Element, Layout>)); \
+}
+
+/// Registers the host-side GEMM reference bindings on `m`.
+///
+/// Registered names: "gemm" and "gemm_saturate" (one set of layout overloads
+/// per type combination listed below) and "equals".
+///
+/// Template arguments of the binders are
+/// <ElementA, ElementB, ElementC, AccumulatorType, ComputeType>.
+/// The relative order of the registrations is preserved because repeated
+/// definitions under one name accumulate as overloads. Each type combination
+/// is registered exactly once; the list previously repeated two of the s8
+/// combinations verbatim in each of the four s8 groups, which only added
+/// redundant overloads.
+void bind_gemm_host_reference(py::module &m) {
+    using half_t = cutlass::half_t;
+    using bfloat16_t = cutlass::bfloat16_t;
+    using RowMajor = cutlass::layout::RowMajor;
+    using ColumnMajor = cutlass::layout::ColumnMajor;
+    using RowMajorInterleaved32 = cutlass::layout::RowMajorInterleaved<32>;
+    using ColumnMajorInterleaved32 = cutlass::layout::ColumnMajorInterleaved<32>;
+
+    /// double
+    bind_host_gemm_multiply_add<double, double, double, double, double>(m);
+    /// float
+    bind_host_gemm_multiply_add<float, float, float, float, float>(m);
+    /// half_t
+    bind_host_gemm_multiply_add<half_t, half_t, half_t, half_t, half_t>(m);
+    bind_host_gemm_multiply_add<half_t, half_t, half_t, float, float>(m);
+    bind_host_gemm_multiply_add<half_t, half_t, float, half_t, half_t>(m);
+    bind_host_gemm_multiply_add<half_t, half_t, float, float, float>(m);
+    /// bfloat16
+    bind_host_gemm_multiply_add<bfloat16_t, bfloat16_t, bfloat16_t, float, float>(m);
+    bind_host_gemm_multiply_add<bfloat16_t, bfloat16_t, float, float, float>(m);
+
+    /// s8
+    bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, float>(m);
+    bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, float>(m);
+
+    /// s8, interleaved layouts
+    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
+    bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);
+
+    /// s8, saturating
+    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, float>(m);
+    bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, float>(m);
+
+    /// s8, saturating, interleaved layouts
+    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
+    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
+    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
+    bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);
+
+    //
+    // Tensor equality
+    //
+
+    // float
+    BIND_TENSOR_EQUAL(float, RowMajor);
+    BIND_TENSOR_EQUAL(float, ColumnMajor);
+
+    // double
+    BIND_TENSOR_EQUAL(double, RowMajor);
+    BIND_TENSOR_EQUAL(double, ColumnMajor);
+
+    // half_t
+    BIND_TENSOR_EQUAL(half_t, RowMajor);
+    BIND_TENSOR_EQUAL(half_t, ColumnMajor);
+
+    // bfloat16
+    BIND_TENSOR_EQUAL(bfloat16_t, RowMajor);
+    BIND_TENSOR_EQUAL(bfloat16_t, ColumnMajor);
+
+    // int32_t
+    BIND_TENSOR_EQUAL(int32_t, RowMajor);
+    BIND_TENSOR_EQUAL(int32_t, ColumnMajor);
+
+    // int8_t
+    BIND_TENSOR_EQUAL(int8_t, RowMajor);
+    BIND_TENSOR_EQUAL(int8_t, ColumnMajor);
+    BIND_TENSOR_EQUAL(int8_t, RowMajorInterleaved32);
+    BIND_TENSOR_EQUAL(int8_t, ColumnMajorInterleaved32);
+}
--- a/tools/library/scripts/pycutlass/src/pycutlass/__init__.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/__init__.py
@ -0,0 +1,31 @@
+from pycutlass.type import *
+from pycutlass.tensor_ref import *
+from pycutlass.operation import *
+from pycutlass.epilogue import *
+from pycutlass.compiler import ArtifactManager
+from pycutlass.memory_manager import *
+from pycutlass.arguments import *
+from pycutlass.library import *
+from pycutlass.c_types import *
+from pycutlass.gemm_operation import *
+from pycutlass.conv2d_operation import *
+from pycutlass.compiler import *
+from pycutlass.utils import *
+from pycutlass.frontend import *
+from pycutlass.reduction_operation import *
+from pycutlass.compiler import *
+
+# module-wide variables
+
+import sys
+this = sys.modules[__name__]
+
+# artifact manager
+this.compiler = ArtifactManager()
+
+def get_memory_pool(init_pool_size=0, max_pool_size=2**34):
+    this.memory_pool = PoolMemoryManager(
+        init_pool_size=init_pool_size,
+        max_pool_size=max_pool_size
+    )
+    return this.memory_pool
--- a/tools/library/scripts/pycutlass/src/pycutlass/arguments.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/arguments.py
@ -0,0 +1,104 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from .frontend import CupyFrontend
+from typeguard import typechecked
+from pycutlass.frontend import *
+from typing import Union
+import numpy as np
+from cuda import cuda
+try:
+    import torch
+    torch_available = True
+except ImportError:
+    torch_available = False
+from cuda import cudart
+try:
+    import cupy as cp
+    cupy_available = True
+except ImportError:
+    cupy_available = False
+
+
+# @typechecked
+class ArgumentBase:
+    """
+    Base class for operation arguments
+
+    Resolves the operands A, B, C and D to device pointers stored as
+    `ptr_A`, `ptr_B`, `ptr_C` and `ptr_D`. The frontend is chosen from the
+    type of `A` only; B, C and D go through the same frontend.
+
+    For numpy inputs the objects returned by `NumpyFrontend.argument` are kept
+    in `buffer_A` .. `buffer_D`, and `D` itself is remembered as `host_D` so
+    that `sync()` can copy the device result back into it.
+    """
+
+    def __init__(self,
+                 A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
+                 B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
+                 C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
+                 D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
+                 **kwargs) -> None:
+        """
+        :param A: first operand; its type selects the frontend
+        :param B: second operand
+        :param C: third operand
+        :param D: fourth operand; for numpy inputs it is also the host array
+            that `sync()` copies the result into
+        :param kwargs: accepted but not used by this base class
+        :raises TypeError: if `A` is none of the supported types
+        """
+
+        # preprocessing input tensors
+        if isinstance(A, np.ndarray):
+            # keep the host array so that sync() can copy the result back
+            self.host_D = D
+            # NOTE(review): the boolean is presumably an "is output" flag
+            # (only D passes True) - confirm against NumpyFrontend.argument
+            self.buffer_A = NumpyFrontend.argument(A, False)
+            self.buffer_B = NumpyFrontend.argument(B, False)
+            self.buffer_C = NumpyFrontend.argument(C, False)
+            self.buffer_D = NumpyFrontend.argument(D, True)
+            self.ptr_A = self.buffer_A.ptr
+            self.ptr_B = self.buffer_B.ptr
+            self.ptr_C = self.buffer_C.ptr
+            self.ptr_D = self.buffer_D.ptr
+        elif torch_available and isinstance(A, torch.Tensor):
+            self.ptr_A = TorchFrontend.argument(A)
+            self.ptr_B = TorchFrontend.argument(B)
+            self.ptr_C = TorchFrontend.argument(C)
+            self.ptr_D = TorchFrontend.argument(D)
+        elif isinstance(A, cuda.CUdeviceptr):
+            # raw device pointers are used as they are
+            self.ptr_A = A
+            self.ptr_B = B
+            self.ptr_C = C
+            self.ptr_D = D
+        elif cupy_available and isinstance(A, cp.ndarray):
+            self.ptr_A = CupyFrontend.argument(A)
+            self.ptr_B = CupyFrontend.argument(B)
+            self.ptr_C = CupyFrontend.argument(C)
+            self.ptr_D = CupyFrontend.argument(D)
+        else:
+            raise TypeError(
+                "Unsupported Frontend. Supported inputs: numpy.ndarray, "
+                "torch.Tensor, cupy.ndarray and cuda.CUdeviceptr")
+
+    def sync(self, stream_sync=True):
+        """
+        Wait for the device and, for numpy inputs, copy D back to the host.
+
+        :param stream_sync: when true, call `cudaDeviceSynchronize` first
+        :raises RuntimeError: if the synchronization or the copy reports an
+            error
+        """
+        if stream_sync:
+            err, = cudart.cudaDeviceSynchronize()
+            # runtime API calls report cudaError_t, not the driver's CUresult
+            if err != cudart.cudaError_t.cudaSuccess:
+                raise RuntimeError("CUDA Error %s" % str(err))
+
+        # host_D only exists when the arguments were numpy arrays
+        if hasattr(self, "host_D"):
+            err, = cuda.cuMemcpyDtoH(
+                self.host_D, self.ptr_D, self.host_D.size * self.host_D.itemsize)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError("CUDA Error %s" % str(err))
--- a/tools/library/scripts/pycutlass/src/pycutlass/c_types.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/c_types.py
@ -0,0 +1,252 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import ctypes
+from pycutlass.library import *
+
+# 12B
+
+
+class GemmCoord_(ctypes.Structure):
+    """
+    ctypes mirror of a GEMM coordinate: three C ints named m, n and k.
+    """
+    _fields_ = [
+        ("m", ctypes.c_int),
+        ("n", ctypes.c_int),
+        ("k", ctypes.c_int),
+    ]
+
+    def __init__(self, gemm_coord) -> None:
+        """
+        Copy m, n and k from `gemm_coord`, which must expose each of them as a
+        zero-argument method.
+        """
+        for entry in self._fields_:
+            name = entry[0]
+            accessor = getattr(gemm_coord, name)
+            setattr(self, name, accessor())
+
+
+class MatrixCoord_(ctypes.Structure):
+    """
+    ctypes mirror of a matrix coordinate: two C ints, `row` then `column`.
+    """
+    _fields_ = [(name, ctypes.c_int) for name in ("row", "column")]
+
+
+# Maps cutlass element types to the ctypes type used for the scalar fields of
+# the epilogue parameter struct (see get_epilogue_output_op).
+# float16 is mapped to a 16-bit unsigned integer, i.e. it is carried as a raw
+# 16-bit value rather than as a native float type.
+dtype2ctype = {
+    cutlass.float16: ctypes.c_uint16,
+    cutlass.float32: ctypes.c_float,
+    cutlass.float64: ctypes.c_double,
+    cutlass.int32: ctypes.c_int32
+}
+
+
+def get_epilogue_output_op(element_compute_):
+    """
+    Build the ctypes struct type holding the epilogue parameters.
+
+    :param element_compute_: key into `dtype2ctype`; selects the C type of the
+        `alpha` and `beta` scalars
+    :return: a new ctypes.Structure subclass with the fields alpha, beta,
+        alpha_ptr and beta_ptr (the two pointers are untyped `c_void_p`)
+    :raises KeyError: if `element_compute_` is not in `dtype2ctype`
+    """
+    scalar_type = dtype2ctype[element_compute_]
+    pointer_type = ctypes.c_void_p
+
+    class _EpilogueOutputOpParams(ctypes.Structure):
+        _fields_ = [
+            ("alpha", scalar_type),
+            ("beta", scalar_type),
+            ("alpha_ptr", pointer_type),
+            ("beta_ptr", pointer_type),
+        ]
+
+    return _EpilogueOutputOpParams
+
+
+def get_gemm_arguments(element_compute_):
+    """
+    Build the ctypes struct types describing the arguments of a GEMM.
+
+    :param element_compute_: element type of the epilogue scalars, forwarded
+        to `get_epilogue_output_op`
+    :return: tuple `(_GemmArguments, _EpilogueOutputOpParams)` of ctypes
+        Structure subclasses
+    """
+    _EpilogueOutputOpParams = get_epilogue_output_op(element_compute_)
+
+    class _GemmArguments(ctypes.Structure):
+        # The field order defines the memory layout, so it is kept exactly.
+        # The spelling "ptr_gether_B_indices" is part of the existing
+        # interface and is preserved as is.
+        _fields_ = (
+            [
+                ("mode", ctypes.c_int),
+                ("problem_size", GemmCoord_),
+                ("batch_count", ctypes.c_int),
+                ("epilogue", _EpilogueOutputOpParams),
+            ]
+            + [(name, ctypes.c_void_p)
+               for name in ("ptr_A", "ptr_B", "ptr_C", "ptr_D")]
+            + [(name, ctypes.c_longlong)
+               for name in (
+                   "batch_stride_A", "batch_stride_B",
+                   "batch_stride_C", "batch_stride_D",
+                   "stride_a", "stride_b", "stride_c", "stride_d",
+                   "lda", "ldb", "ldc", "ldd")]
+            + [(name, ctypes.c_void_p)
+               for name in (
+                   "ptr_gather_A_indices",
+                   "ptr_gether_B_indices",
+                   "ptr_scatter_D_indices")]
+        )
+
+    return _GemmArguments, _EpilogueOutputOpParams
+
+
+###########################################################################################
+# GEMM Grouped
+###########################################################################################
+
+# include/cutlass/gemm/kernel/gemm_grouped.h
+
+def get_gemm_grouped_arguments(element_compute_):
+    """
+    Build the ctypes struct types describing the arguments of a grouped GEMM.
+
+    :param element_compute_: element type of the epilogue scalars, forwarded
+        to `get_epilogue_output_op`
+    :return: tuple `(_GEMMGroupedArguments, _EpilogueOutputOpParams)` of
+        ctypes Structure subclasses
+    """
+    _EpilogueOutputOpParams = get_epilogue_output_op(element_compute_)
+
+    class _GEMMGroupedArguments(ctypes.Structure):
+        # Field order defines the memory layout. Apart from the two counters
+        # and the epilogue struct, every member is an untyped pointer.
+        _fields_ = (
+            [
+                ("problem_sizes", ctypes.c_void_p),
+                ("problem_count", ctypes.c_int),
+                ("threadblock_count", ctypes.c_int),
+                ("output_op", _EpilogueOutputOpParams),
+            ]
+            + [(name, ctypes.c_void_p)
+               for name in (
+                   "ptr_A", "ptr_B", "ptr_C", "ptr_D",
+                   "lda", "ldb", "ldc", "ldd",
+                   "host_problem_sizes")]
+        )
+
+    return _GEMMGroupedArguments, _EpilogueOutputOpParams
+
+############################################################################################
+# Convolution2D
+############################################################################################
+
+
+# We use the arguments as the interface
+
+
+# include/cutlass/conv/conv2d_problem_size.h
+# 72B (18 four-byte int fields)
+class Conv2DProblemSize(ctypes.Structure):
+    """
+    ctypes mirror of a 2-D convolution problem size: 18 C int fields.
+    """
+    _fields_ = [(name, ctypes.c_int) for name in (
+        "N", "H", "W", "C",
+        "P", "Q",
+        "K", "R", "S",
+        "pad_h", "pad_w",
+        "stride_h", "stride_w",
+        "dilation_h", "dilation_w",
+        "mode",  # kCrossCorrelation: 0, kConvolution: 1
+        "split_k_slices",
+        "groups",
+    )]
+
+    def __init__(self, problem_size) -> None:
+        """
+        Copy every field from the attribute of the same name on
+        `problem_size`.
+        """
+        for name, _ctype in self._fields_:
+            setattr(self, name, getattr(problem_size, name))
+
+
+# include/cutlass/layout/tensor.h
+# 12B
+class Layout4D(ctypes.Structure):
+    """
+    ctypes mirror of a 4-D tensor layout: an array of three C int strides.
+    """
+    _fields_ = [
+        ("stride", ctypes.c_int * 3),
+    ]
+
+    def __init__(self, tensor_ref):
+        """
+        Fill `stride` from `tensor_ref.stride()`, reading elements 0, 1 and 2
+        through its `at(index)` method.
+        """
+        strides = tensor_ref.stride()
+        self.stride = tuple(strides.at(axis) for axis in range(3))
+
+# TODO: Tensor 5-D takes ("stride", ctypes.c_int * 4)
+
+
+# include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
+# TensorRef is basically cutlass::TensorRef<Element, Layout>;
+# include/cutlass/tensor_ref.h
+# 24B
+class TensorRef_(ctypes.Structure):
+    """
+    ctypes mirror of a tensor reference: a data pointer followed by a
+    Layout4D.
+    """
+    _fields_ = [
+        ("ptr", ctypes.c_void_p),
+        ("layout", Layout4D),
+    ]
+
+    def __init__(self, tensor_ref):
+        """
+        Take the pointer from `tensor_ref.data()` and build the layout from
+        `tensor_ref.layout()`.
+        """
+        self.ptr = tensor_ref.data()
+        self.layout = Layout4D(tensor_ref.layout())
+
+
+class TensorRef2D_(ctypes.Structure):
+    """
+    ctypes mirror of a 2-D tensor reference: a data pointer and one C int
+    stride.
+    """
+    _fields_ = [
+        ("ptr", ctypes.c_void_p),
+        ("stride", ctypes.c_int),
+    ]
+
+
+# include/cutlass/conv/kernel/implicit_gemm_convolution.h
+# split_k_mode: kNone: 0, kSerial: 1, kParallel: 2, kParallelSerial: 3, kInvalid: 4
+
+def get_conv2d_arguments(element_compute_):
+    """
+    Build the ctypes struct types describing the arguments of a 2-D
+    convolution.
+
+    :param element_compute_: element type of the epilogue scalars, forwarded
+        to `get_epilogue_output_op`
+    :return: tuple `(_Conv2dArguments, _EpilogueOutputOpParams)` of ctypes
+        Structure subclasses
+    """
+    _EpilogueOutputOpParams = get_epilogue_output_op(element_compute_)
+
+    class _Conv2dArguments(ctypes.Structure):
+        # Byte offsets as annotated by the original author:
+        #   problem_size 0, ref_A 72, ref_B 96, ref_C 120, ref_D 144,
+        #   output_op 168, split_k_mode 192
+        # NOTE(review): the last offset presumably assumes a 24-byte epilogue
+        # struct - confirm for other element types.
+        _fields_ = (
+            [("problem_size", Conv2DProblemSize)]
+            + [(name, TensorRef_)
+               for name in ("ref_A", "ref_B", "ref_C", "ref_D")]
+            + [
+                ("output_op", _EpilogueOutputOpParams),
+                ("split_k_mode", ctypes.c_int),
+            ]
+        )
+
+    return _Conv2dArguments, _EpilogueOutputOpParams
+
+
+############################################################################################
+# Reduction
+############################################################################################
+
+
+def get_reduction_params(element_compute_):
+    """
+    Build the ctypes struct types describing the parameters of a reduction.
+
+    :param element_compute_: element type of the epilogue scalars, forwarded
+        to `get_epilogue_output_op`
+    :return: tuple `(_ReductionParams, epilogue parameter struct type)` of
+        ctypes Structure subclasses
+    """
+    epilogue_params_type = get_epilogue_output_op(element_compute_)
+
+    class _ReductionParams(ctypes.Structure):
+        # Field order defines the memory layout and is kept exactly.
+        _fields_ = (
+            [
+                ("problem_size", MatrixCoord_),
+                ("partitions", ctypes.c_int),
+                ("partition_stride", ctypes.c_longlong),
+            ]
+            + [(name, TensorRef2D_)
+               for name in ("workspace", "destination", "source")]
+            + [("output_op", epilogue_params_type)]
+        )
+
+    return _ReductionParams, epilogue_params_type
--- a/tools/library/scripts/pycutlass/src/pycutlass/cache.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/cache.py
@ -0,0 +1,366 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from pycutlass import *
+from pycutlass.library import SubstituteTemplate
+import cutlass
+from cuda import cuda
+from cuda import nvrtc
+import tempfile
+import os
+import ctypes
+
+# 
+import json
+import sqlite3
+
+
+# Template for one `#include "..."` line (with trailing newline); `${include}`
+# is replaced with the header path through SubstituteTemplate.
+IncludeTemplate = r'''#include "${include}"
+'''
+
+#
+class CompilationOptions:
+  '''
+  Compilation options.
+
+  Holds the compiler flags, the include search paths and the target
+  architectures, and renders them as a list of byte strings with get().
+  '''
+
+  #
+  def __init__(self, architectures = None, include_paths = None):
+    '''
+    architectures: iterable of integer compute capabilities (e.g. 80 for
+      sm_80); defaults to [80]
+    include_paths: iterable of include directories; defaults to no extra path
+
+    The defaults are created per instance, so mutating one instance's lists
+    never leaks into other instances.
+    '''
+    self.includes = []
+    self.include_paths = include_paths if include_paths is not None else []
+    self.flags = ['-std=c++11', '-default-device']
+    self.architectures = architectures if architectures is not None else [80]
+
+  #
+  def get(self):
+    '''
+    Returns the options as a list of bytes objects: the flags first, then one
+    '--include-path=<dir>' entry per include path, then a single
+    '-arch=sm_XX[,sm_YY...]' entry listing all architectures.
+    '''
+    options = [flag.encode() for flag in self.flags]
+
+    options.extend(
+      ('--include-path=%s' % incl).encode() for incl in self.include_paths)
+
+    arch_list = "-arch=" + ",".join(
+      "sm_%d" % arch for arch in self.architectures)
+    options.append(arch_list.encode())
+
+    return options
+
+def convertToBinaryData(filename):
+    """
+    Read the file at `filename` in binary mode and return its whole content
+    as bytes.
+    """
+    with open(filename, 'rb') as stream:
+        return stream.read()
+
+def CDLLBin(host_binary):
+    """
+    Load a shared library that is given as raw bytes.
+
+    The bytes are written to a temporary `host_func*.so` file in the current
+    working directory, the file is loaded with ctypes.CDLL and then removed
+    again when the temporary file is closed.
+
+    :param host_binary: content of the shared object as bytes
+    :return: the loaded ctypes.CDLL handle
+    """
+    # `dir` keeps the file in the working directory without overwriting the
+    # process-wide tempfile.tempdir setting.
+    with tempfile.NamedTemporaryFile(
+            prefix='host_func', suffix='.so', dir="./", delete=True) as temp_so:
+        temp_so.write(host_binary)
+        # make sure every byte is on disk before the loader opens the file
+        temp_so.flush()
+        host_lib = ctypes.CDLL(temp_so.name)
+    return host_lib
+
+
+class ArtifactManager:
+    """
+    Artifact manager
+
+    Compiles operations (device code through NVRTC, host helper functions
+    through g++), keeps the results in two in-memory `cutlass.CompileCache`
+    objects and persists them in the SQLite database `./compiled_cache.db`
+    so that they can be reloaded instead of recompiled.
+    """
+    def __init__(self) -> None:
+        """
+        Create the `compiled_operations` table if it does not exist yet and
+        set up the in-memory device and host caches.
+        """
+        # Best effort: a database problem must not prevent construction.
+        try:
+            connection = sqlite3.connect("./compiled_cache.db")
+            try:
+                cursor = connection.cursor()
+                sqlite_create_table_query = """CREATE TABLE IF NOT EXISTS compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
+                cursor.execute(sqlite_create_table_query)
+                connection.commit()
+                cursor.close()
+            finally:
+                connection.close()
+        except sqlite3.Error:
+            pass
+
+        self.compiled_cache_device = cutlass.CompileCache()
+        self.compiled_cache_host = cutlass.CompileCache()
+    
+    def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
+        """
+        Store one compiled operation in the database.
+
+        An existing row with the same `op_key` is left untouched
+        (INSERT OR IGNORE).
+
+        :param op_key: unique key of the operation
+        :param cubin: compiled device image
+        :param hostfile: path of the compiled host shared object; its content
+            is read and stored
+        :param op_name: name of the operation
+        :param op_attrs: JSON-serializable attributes; as written by
+            add_module this is the parameter size followed by the names of
+            the extra host functions
+        """
+        sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
+
+        hostbin = convertToBinaryData(hostfile)
+
+        data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
+
+        connection = sqlite3.connect("./compiled_cache.db")
+        try:
+            cursor = connection.cursor()
+            cursor.execute(sqlite_insert_blob_query, data_tuple)
+            connection.commit()
+            cursor.close()
+        finally:
+            connection.close()
+    
+    def load_operation(self, op_key):
+        """
+        Load a compiled operation from the database into the in-memory caches.
+
+        :param op_key: unique key of the operation
+        :return: True if a stored operation was found and loaded, else False
+        :raises RuntimeError: if the CUDA module or kernel cannot be loaded
+        """
+        sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
+        connection = sqlite3.connect("./compiled_cache.db")
+        try:
+            cursor = connection.cursor()
+            cursor.execute(sqlite_fetch_blob_query, (op_key, ))
+            record = cursor.fetchall()
+            cursor.close()
+        finally:
+            connection.close()
+
+        if len(record) == 0:
+            return False
+        for row in record:
+            key, cubin_image, host_binary, operation_name, op_attr = row
+            op_attr = json.loads(op_attr)
+            err, module = cuda.cuModuleLoadData(cubin_image)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+            
+            err, kernel = cuda.cuModuleGetFunction(module, bytes(str.encode(operation_name)))
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+            self.compiled_cache_device.insert(key, kernel)
+
+            compiled_host_fns = {}
+            host_lib = CDLLBin(host_binary)
+
+            func_name = operation_name + '_get_params'
+            func = getattr(host_lib, func_name)
+            # op_attr[0] is the parameter size stored by add_module
+            func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
+            compiled_host_fns['get_args'] = func
+
+            func_name = operation_name + '_shared_memory_size'
+            func = getattr(host_lib, func_name)
+            compiled_host_fns['shared_memory_capacity'] = func()
+
+            # the string entries of op_attr name the extra host functions
+            for attr in op_attr:
+                if isinstance(attr, str):
+                    func_name = operation_name + '_' + attr
+                    func = getattr(host_lib, func_name)
+                    compiled_host_fns[attr] = func
+            
+            self.compiled_cache_host.insert(key, compiled_host_fns)
+        return True
+
+
+    def emit_compile_(self, operation_list, compilation_options):
+        """
+        Compile a list of kernels and store them into database
+
+        The device source is compiled with NVRTC into a cubin image; the host
+        source is compiled with g++ into a temporary shared object in the
+        current working directory.
+
+        :param operation_list: runtime modules to compile
+        :param compilation_options: CompilationOptions providing the flags
+        :return: tuple `(cubin_image, host_lib, temp)` where `host_lib` is the
+            loaded ctypes.CDLL and `temp` the temporary file object holding
+            the shared object (the file is deleted when `temp` is closed)
+        :raises RuntimeError: if NVRTC or the host compiler reports an error
+        """
+        # function-scope import: only the host compilation step needs it
+        import subprocess
+
+        source_buffer_device = ""
+        source_buffer_host = ""
+        # 1. include
+        includes = []
+        for operation in operation_list:
+            for incl in operation.emitter.includes:
+                if incl not in includes:
+                    includes.append(incl)
+        
+        includes_host = [
+          "builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
+        for incl in includes:
+            source_buffer_device += SubstituteTemplate(IncludeTemplate, {'include': incl})
+        
+        # headers whose path contains "/device/" are left out of the host source
+        for incl in includes_host:
+            if "/device/" not in incl:
+                source_buffer_host += SubstituteTemplate(IncludeTemplate, { 'include': incl} )
+            
+        
+        # 2. Operations
+        for operation in operation_list:
+            source_buffer_device += operation.emit()
+            source_buffer_host += operation.emit()
+            values = {
+                'operation_name': operation.name(),
+                'operation_suffix': operation.emitter.operation_suffix
+            }
+            source_buffer_device += SubstituteTemplate(operation.KernelTemplate, values)
+            source_buffer_host += SubstituteTemplate(operation.HostTemplate, values)
+        
+        # 3. compile
+        err, program = nvrtc.nvrtcCreateProgram(
+            str.encode(source_buffer_device), 
+            bytes(str.encode("module.cu")), 
+            0, [], [])
+
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError('NVRTC Error: {}'.format(err))
+
+        # Compile program
+        options = compilation_options.get()
+
+        err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+
+            error_string = 'NVRTC Error: {}\n'.format(err)
+
+            # Get log from compilation
+            err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError('NVRTC Error: {}'.format(err))
+            
+            log = b' ' * logSize
+            err, = nvrtc.nvrtcGetProgramLog(program, log)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError('NVRTC Error: {}'.format(err))
+            
+            raise RuntimeError(error_string + log.decode() + source_buffer_device)
+
+        # Get data from compilation
+        err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError('NVRTC Error: {}'.format(err))
+        
+        cubin_image = b' ' * dataSize
+        err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError('NVRTC Error: {}'.format(err))
+
+        # compile the host code
+        # The source is fed to g++ on stdin as an argument list (no shell), so
+        # quotes and backslashes in the generated code reach the compiler
+        # unchanged.
+        cmd = ["g++", "-x", "c++", "-fpermissive", "-w", "-fPIC"]
+        for opt in compilation_options.get():
+            opt = opt.decode("utf-8") 
+            # NVRTC-only flags are not passed on; this covers every -arch
+            # value, not only sm_80
+            if opt in ('-default-device', '-std=c++11') or opt.startswith('-arch='):
+                continue
+            if '--include-path=' in opt:
+                cmd.append(opt.replace('--include-path=', '-I'))
+            else:
+                cmd.append(opt)
+
+        # `dir` keeps the shared object in the working directory without
+        # overwriting the process-wide tempfile.tempdir setting
+        temp = tempfile.NamedTemporaryFile(
+            prefix='host_func', suffix='.so', dir="./", delete=True)
+
+        cmd += ['-', '-shared', '-o', temp.name]
+        result = subprocess.run(
+            cmd, input=source_buffer_host.encode(), stderr=subprocess.PIPE)
+        if result.returncode != 0:
+            raise RuntimeError(
+                'Host compilation failed (g++ exit code {}):\n{}'.format(
+                    result.returncode,
+                    result.stderr.decode(errors='replace')))
+        host_lib = ctypes.CDLL(temp.name)
+        
+        return cubin_image, host_lib, temp
+
+    
+    def add_module(self, operations, compile_options=None):
+        """
+        Insert a new compiled device module
+
+        For every operation the kernel is looked up in the in-memory cache,
+        then in the database; operations found in neither are compiled
+        together, registered in both caches and written to the database.
+
+        :param operations: operations exposing `rt_module` and
+            `procedural_name()`
+        :param compile_options: CompilationOptions; when None they are derived
+            from the environment variables CUTLASS_PATH and CUDA_INSTALL_PATH
+            and from the operations' minimum compute capabilities
+        :raises RuntimeError: if compiling or loading the module fails
+        """
+        if compile_options is None:
+            cutlass_path = os.getenv('CUTLASS_PATH')
+            assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
+            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
+            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
+            architectures = []
+            for operation in operations:
+                if hasattr(operation, "tile_description"):
+                    cc = operation.tile_description.minimum_compute_capability
+                    if cc not in architectures:
+                        architectures.append(cc)
+            include_paths = [
+                cuda_install_path + '/include',
+                cutlass_path + '/include',
+                cutlass_path + '/tools/util/include',
+            ]
+            compile_options = CompilationOptions(architectures, include_paths)
+        # save the cubin
+        operation_key = []
+        operation_list = []
+        for operation in operations:
+            # step 1: get kernel string as key
+            key = operation.rt_module.emit() + operation.procedural_name()
+            # step 2: check if the operation is in cache
+            compiled_kernel = self.compiled_cache_device.at(key)
+
+            if compiled_kernel is None:
+                # not in memory: try the database
+                hit = self.load_operation(key)
+                if hit:
+                    compiled_kernel = self.compiled_cache_device.at(key)
+                    assert compiled_kernel is not None
+            if compiled_kernel is not None:
+                operation.rt_module.kernel = compiled_kernel
+                compiled_host_fns = self.compiled_cache_host.at(key)
+                assert compiled_host_fns is not None
+                for fn_name in compiled_host_fns.keys():
+                    setattr(operation.rt_module, fn_name, compiled_host_fns[fn_name])
+                operation.rt_module.initialize()
+            else:
+                # cache miss: queue the runtime module for compilation
+                operation_list.append(operation.rt_module)
+                operation_key.append(key)
+        if len(operation_list) > 0:
+            cubin_image, host_lib, host_file = self.emit_compile_(operation_list, compile_options)
+
+            err, module = cuda.cuModuleLoadData(cubin_image)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+            
+            operation_name = []
+            operation_attr = []
+            for operation, key in zip(operation_list, operation_key):
+                # get device kernels
+                err, operation.kernel = cuda.cuModuleGetFunction(
+                    module,
+                    bytes(str.encode(operation.name()))
+                )
+                if err != cuda.CUresult.CUDA_SUCCESS:
+                    raise RuntimeError('Cuda Error: {}'.format(err))
+                operation_name.append(operation.name())
+                self.compiled_cache_device.insert(key, operation.kernel)
+                # get host functions
+                compiled_host_fns = {}
+                op_attr = []
+
+                # get param size
+                func_name = operation.name() + '_get_param_size'
+                func = getattr(host_lib, func_name)
+                param_size = func()
+
+                func_name = operation.name() + '_get_params'
+                func = getattr(host_lib, func_name)
+                # NOTE(review): ctypes reads `argtypes`; `argtype` is a plain
+                # attribute and does not influence argument conversion -
+                # confirm whether `argtypes` was intended.
+                func.argtype = operation.argtype
+                func.restype = ctypes.POINTER(ctypes.c_char * param_size)
+                setattr(operation, 'get_args', func)
+                compiled_host_fns['get_args'] = func
+            
+                # set shared memory size
+                func_name = operation.name() + '_shared_memory_size'
+                func = getattr(host_lib, func_name)
+                shared_memory_capacity = func()
+                setattr(operation, 'shared_memory_capacity', shared_memory_capacity)
+                compiled_host_fns['shared_memory_capacity'] = shared_memory_capacity
+                # set the maximum dynamic shared size
+                operation.initialize()
+
+                # get extra functions
+                op_attr.append(param_size)
+
+                if hasattr(operation, "extra_funcs"):
+                    for suffix in operation.extra_funcs:
+                        func_name = operation.name() + '_' + suffix
+                        func = getattr(host_lib, func_name)
+                        setattr(operation, suffix, func)
+                        compiled_host_fns[suffix] = func
+                        op_attr.append(suffix)
+                
+                operation_attr.append(op_attr)
+                self.compiled_cache_host.insert(key, compiled_host_fns)
+
+            # persist every newly compiled operation; they all share the same
+            # cubin image and host shared object
+            for key, name, attrs in zip(operation_key, operation_name, operation_attr):
+                self.insert_operation(key, cubin_image, host_file.name, name, attrs)
+
+
+# Module-level ArtifactManager instance, constructed when this module is
+# imported; its constructor opens ./compiled_cache.db in the current working
+# directory.
+artifact_manager = ArtifactManager()
--- a/tools/library/scripts/pycutlass/src/pycutlass/compiler.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/compiler.py
@ -0,0 +1,430 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from pycutlass import *
+from pycutlass.library import SubstituteTemplate
+import cutlass
+from cuda import cuda
+from cuda import nvrtc
+import tempfile
+import os
+import ctypes
+
+#
+import json
+import sqlite3
+
+
+# Template for a single `#include "..."` line; `${include}` is filled in
+# through SubstituteTemplate when the source buffers are assembled in
+# ArtifactManager.emit_compile_.
+IncludeTemplate = r'''#include "${include}"
+'''
+
+#
+
+
+class CompilationOptions:
+    '''
+    Compilation options handed to the compilers.
+
+    :param flags: compiler flags, e.g. ``['-std=c++11']``
+    :type flags: list[str]
+
+    :param architectures: compute capabilities to target (``80`` becomes
+     ``sm_80``), defaults to ``[80]``
+    :type architectures: list[int], optional
+
+    :param include_paths: directories emitted as ``--include-path=<dir>``,
+     defaults to no directories
+    :type include_paths: list[str], optional
+    '''
+
+    #
+    def __init__(self, flags, architectures=None, include_paths=None):
+        # The default lists are created per instance. A mutable default
+        # argument would be one list object shared (and mutated) by every
+        # CompilationOptions constructed without that argument.
+        self.includes = []
+        self.include_paths = [] if include_paths is None else include_paths
+        self.flags = flags
+        self.architectures = [80] if architectures is None else architectures
+
+    def _arch_flag(self):
+        ''' Returns the ``-arch=sm_XX[,sm_YY...]`` flag for all architectures. '''
+        return "-arch=" + ",".join(
+            "sm_%d" % arch for arch in self.architectures)
+
+    def get_str(self):
+        ''' Returns all options as one command-line string. Every option,
+        including the first one, is preceded by a single space. '''
+        parts = list(self.flags)
+        parts.extend('--include-path=%s' % incl for incl in self.include_paths)
+        parts.append(self._arch_flag())
+        return "".join(" " + part for part in parts)
+
+    #
+    def get(self):
+        ''' Returns all options as a list of encoded byte strings, one entry
+        per option, in the order flags, include paths, architecture flag. '''
+        options = [bytes(str.encode(flag)) for flag in self.flags]
+        options.extend(
+            bytes(str.encode('--include-path=%s' % incl))
+            for incl in self.include_paths)
+        options.append(bytes(str.encode(self._arch_flag())))
+        return options
+
+
+def convertToBinaryData(filename):
+    ''' Reads the file at ``filename`` in binary mode and returns its bytes. '''
+    with open(filename, mode='rb') as binary_file:
+        return binary_file.read()
+
+
+def CDLLBin(host_binary):
+    '''
+    Loads a shared library from its raw bytes.
+
+    The bytes are written to a temporary ``host_func*.so`` file in the
+    current working directory and the file is loaded with ``ctypes.CDLL``.
+    The temporary file is removed again before this function returns.
+
+    :param host_binary: content of the shared library
+    :type host_binary: bytes
+
+    :return: the loaded library
+    :rtype: ctypes.CDLL
+    '''
+    # NOTE(review): this changes the process-wide default directory of the
+    # tempfile module; kept as is because other code may rely on it.
+    tempfile.tempdir = "./"
+    # Write through the temporary file's own handle and flush before loading.
+    # The with-block closes (and thereby deletes) the file deterministically
+    # instead of leaving that to garbage collection of the file object.
+    with tempfile.NamedTemporaryFile(
+            prefix='host_func', suffix='.so', delete=True) as temp_so:
+        temp_so.write(host_binary)
+        temp_so.flush()
+        host_lib = ctypes.CDLL(temp_so.name)
+    return host_lib
+
+
+class ArtifactManager:
+    """
+    Artifact manager
+    """
+
+    def __init__(self) -> None:
+        '''
+        Creates the on-disk cache table if it is missing, selects the NVRTC
+        backend with its default compile options and creates the two
+        in-memory caches (device kernels and host functions).
+        '''
+        # Setting up the persistent cache is best effort: a database error
+        # is ignored here and construction continues.
+        connection = None
+        try:
+            connection = sqlite3.connect("./compiled_cache.db")
+            cursor = connection.cursor()
+            # IF NOT EXISTS: an already existing table is the normal case on
+            # every run after the first one, not an error.
+            sqlite_create_table_query = """CREATE TABLE IF NOT EXISTS compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
+            cursor.execute(sqlite_create_table_query)
+            connection.commit()
+            cursor.close()
+        except sqlite3.Error:
+            pass
+        finally:
+            # release the database handle on success and on failure
+            if connection is not None:
+                connection.close()
+
+        self.backend = "nvrtc"
+        self.default_compile_options = [
+            '-std=c++11', '-default-device',
+        ]
+        self.compiled_cache_device = cutlass.CompileCache()
+        self.compiled_cache_host = cutlass.CompileCache()
+
+    def nvcc(self):
+        ''' Selects the nvcc backend and replaces the default compile options
+        with the ones used for it. '''
+        self.default_compile_options = ['-std=c++11']
+        self.backend = "nvcc"
+    def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
+        '''
+        Stores one compiled operation in the on-disk cache.
+
+        :param op_key: key of the operation (``op_key`` column)
+        :param cubin: device binary (``cubin`` column)
+        :param hostfile: path of the compiled host library; the content of
+         the file is stored in the ``hostbin`` column
+        :param op_name: name of the operation (``op_name`` column)
+        :param op_attrs: attributes of the operation; stored as JSON text in
+         the ``op_attrs`` column
+        '''
+        # Read and serialize everything before the database is opened, so a
+        # failure here cannot leave a connection behind.
+        hostbin = convertToBinaryData(hostfile)
+        data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
+        sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
+
+        connection = sqlite3.connect("./compiled_cache.db")
+        try:
+            cursor = connection.cursor()
+            cursor.execute(sqlite_insert_blob_query, data_tuple)
+            connection.commit()
+            cursor.close()
+        finally:
+            # close the connection even when the insert fails
+            connection.close()
+
+    def load_operation(self, op_key):
+        '''
+        Loads a compiled operation from the on-disk cache into the two
+        in-memory caches.
+
+        :param op_key: key of the operation
+
+        :return: ``True`` if a row with this key was found and loaded,
+         ``False`` if there is no such row
+        :rtype: bool
+
+        :raises RuntimeError: if the CUDA driver fails to load the stored
+         cubin or to look up the kernel in it
+        '''
+        sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
+        connection = sqlite3.connect("./compiled_cache.db")
+        try:
+            cursor = connection.cursor()
+            cursor.execute(sqlite_fetch_blob_query, (op_key, ))
+            record = cursor.fetchall()
+            cursor.close()
+        finally:
+            # release the database handle on every path, including the
+            # cache-miss return below
+            connection.close()
+
+        if len(record) == 0:
+            return False
+        for row in record:
+            key, cubin_image, host_binary, operation_name, op_attr = row
+            op_attr = json.loads(op_attr)
+            err, module = cuda.cuModuleLoadData(cubin_image)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+
+            err, kernel = cuda.cuModuleGetFunction(
+                module, bytes(str.encode(operation_name)))
+            # do not cache a kernel handle from a failed lookup
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+            self.compiled_cache_device.insert(key, kernel)
+
+            compiled_host_fns = {}
+            host_lib = CDLLBin(host_binary)
+
+            # op_attr[0] is the parameter size in bytes; it sizes the char
+            # array that <operation_name>_get_params returns a pointer to
+            func_name = operation_name + '_get_params'
+            func = getattr(host_lib, func_name)
+            func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
+            compiled_host_fns['get_args'] = func
+
+            func_name = operation_name + '_shared_memory_size'
+            func = getattr(host_lib, func_name)
+            compiled_host_fns['shared_memory_capacity'] = func()
+
+            # every string entry of op_attr names an extra host function
+            for attr in op_attr:
+                if isinstance(attr, str):
+                    func_name = operation_name + '_' + attr
+                    func = getattr(host_lib, func_name)
+                    compiled_host_fns[attr] = func
+
+            self.compiled_cache_host.insert(key, compiled_host_fns)
+        return True
+
+    def emit_compile_(self, operation_list, compilation_options):
+        """
+        Compile a list of kernels together with their host helper functions.
+
+        The device source is compiled to a cubin with the selected backend
+        (NVRTC, or nvcc otherwise). The host source is compiled to a shared
+        library with g++ and loaded through ctypes.
+
+        :param operation_list: runtime modules to compile; each one provides
+         ``emit()``, ``name()``, ``emitter``, ``KernelTemplate`` and
+         ``HostTemplate``
+        :param compilation_options: options object providing ``get()`` and
+         ``get_str()``
+
+        :return: the cubin image, the loaded host library, and the temporary
+         file object that holds the host library on disk
+        :rtype: tuple
+
+        :raises RuntimeError: if NVRTC, nvcc or g++ reports a failure
+        """
+        import shlex
+        import subprocess
+
+        # 1. include
+        includes = []
+        for operation in operation_list:
+            for incl in operation.emitter.includes:
+                if incl not in includes:
+                    includes.append(incl)
+
+        includes_host = [
+            "builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
+        source_buffer_device = "".join(
+            SubstituteTemplate(IncludeTemplate, {'include': incl})
+            for incl in includes)
+        # headers whose path contains "/device/" are left out of the host source
+        source_buffer_host = "".join(
+            SubstituteTemplate(IncludeTemplate, {'include': incl})
+            for incl in includes_host if "/device/" not in incl)
+
+        # 2. Operations
+        for operation in operation_list:
+            source_buffer_device += operation.emit()
+            source_buffer_host += operation.emit()
+            values = {
+                'operation_name': operation.name(),
+                'operation_suffix': operation.emitter.operation_suffix
+            }
+            source_buffer_device += SubstituteTemplate(
+                operation.KernelTemplate, values)
+            source_buffer_host += SubstituteTemplate(
+                operation.HostTemplate, values)
+
+        if self.backend == "nvrtc":
+            # 3. compile
+            err, program = nvrtc.nvrtcCreateProgram(
+                str.encode(source_buffer_device),
+                bytes(str.encode("module.cu")),
+                0, [], [])
+
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError('NVRTC Error: {}'.format(err))
+
+            # Compile program
+            options = compilation_options.get()
+
+            err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+
+                error_string = 'NVRTC Error: {}\n'.format(err)
+
+                # Get log from compilation
+                err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
+                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                    raise RuntimeError('NVRTC Error: {}'.format(err))
+
+                log = b' ' * logSize
+                err, = nvrtc.nvrtcGetProgramLog(program, log)
+                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                    raise RuntimeError('NVRTC Error: {}'.format(err))
+
+                # the message carries the compiler log and the full source
+                raise RuntimeError(
+                    error_string + log.decode() + source_buffer_device)
+
+            # Get data from compilation
+            err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError('NVRTC Error: {}'.format(err))
+
+            cubin_image = b' ' * dataSize
+            err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError('NVRTC Error: {}'.format(err))
+        else:  # with nvcc backend
+            # emit code
+            tempfile.tempdir = "./"
+            temp_cu = tempfile.NamedTemporaryFile(
+                prefix='kernel', suffix='.cu', delete=True)
+            temp_cubin = tempfile.NamedTemporaryFile(
+                prefix='kernel', suffix='.cubin', delete=True)
+            with open(temp_cu.name, 'w') as file:
+                file.write(source_buffer_device)
+
+            # compile with nvcc
+            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
+            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
+            cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
+            values = {
+                "cuda_install_path": cuda_install_path,
+                "options": compilation_options.get_str(),
+                "srcfile": temp_cu.name,
+                "tarfile": temp_cubin.name
+            }
+            cmd = SubstituteTemplate(cmd_template, values)
+            # fail here instead of loading an empty or stale cubin later
+            if os.system(cmd) != 0:
+                raise RuntimeError('nvcc failed: {}'.format(cmd))
+
+            # load the cubin image
+            with open(temp_cubin.name, 'rb') as file:
+                cubin_image = file.read()
+
+        # compile the host code
+        # Device-only options are dropped and include paths are translated to
+        # the g++ spelling (-I).
+        device_only_options = [
+            '-default-device', '-std=c++11', '-Xcicc', '-Xllc']
+        cmd = ["g++", "-x", "c++", "-fpermissive", "-w", "-fPIC"]
+        for opt in compilation_options.get():
+            opt = opt.decode("utf-8")
+            # every -arch=... value is device-only, not just -arch=sm_80
+            if opt in device_only_options or opt.startswith('-arch='):
+                continue
+            if '--include-path=' in opt:
+                opt = opt.replace('--include-path=', '-I')
+            # split on whitespace like the shell did when the command was
+            # assembled as one string
+            cmd.extend(shlex.split(opt))
+
+        tempfile.tempdir = "./"
+        temp = tempfile.NamedTemporaryFile(
+            prefix='host_func', suffix='.so', delete=True)
+
+        cmd += ["-", "-shared", "-o", temp.name]
+        # The source is passed on stdin without going through a shell:
+        # quoting it for `echo '...'` breaks on single quotes in the
+        # generated C++ and on shells whose echo interprets backslashes.
+        completed = subprocess.run(cmd, input=source_buffer_host.encode())
+        if completed.returncode != 0:
+            raise RuntimeError(
+                'Host compilation with g++ failed with exit code {}'.format(
+                    completed.returncode))
+        host_lib = ctypes.CDLL(temp.name)
+
+        return cubin_image, host_lib, temp
+
+    def add_module(self, operations, compile_options=None):
+        """
+        Insert a new compiled device module
+
+        Operations whose key is found in the in-memory cache or in the
+        on-disk cache get the cached kernel and host functions attached to
+        their runtime module. All remaining operations are compiled together,
+        attached, and stored in both caches.
+
+        :param operations: operations to compile; each one provides
+         ``rt_module`` and ``procedural_name()``
+        :param compile_options: compilation options; when ``None`` they are
+         built from the environment variables ``CUTLASS_PATH`` and
+         ``CUDA_INSTALL_PATH`` and from the minimum compute capability of the
+         operations' tile descriptions
+
+        :raises RuntimeError: if the CUDA driver fails to load the compiled
+         module or to look up a kernel in it
+        """
+        if compile_options is None:
+            cutlass_path = os.getenv('CUTLASS_PATH')
+            assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
+            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
+            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
+            architectures = []
+            for operation in operations:
+                if hasattr(operation, "tile_description"):
+                    cc = operation.tile_description.minimum_compute_capability
+                    if cc not in architectures:
+                        architectures.append(cc)
+            include_paths = [
+                cuda_install_path + '/include',
+                cutlass_path + '/include',
+                cutlass_path + '/tools/util/include',
+            ]
+            compile_options = CompilationOptions(
+                self.default_compile_options, architectures, include_paths)
+        # save the cubin
+        operation_key = []
+        operation_list = []
+        for operation in operations:
+            # step 1: get kernel string as key
+            key = operation.rt_module.emit() + operation.procedural_name() + self.backend
+            # step 2: check if the operation is in cache
+            compiled_kernel = self.compiled_cache_device.at(key)
+
+            if compiled_kernel is None:
+                # not in memory: try the on-disk cache, which fills the
+                # in-memory caches on a hit
+                hit = self.load_operation(key)
+                if hit:
+                    compiled_kernel = self.compiled_cache_device.at(key)
+                    assert compiled_kernel is not None
+            if compiled_kernel is not None:
+                operation.rt_module.kernel = compiled_kernel
+                compiled_host_fns = self.compiled_cache_host.at(key)
+                assert compiled_host_fns is not None
+                for fn_name in compiled_host_fns.keys():
+                    setattr(operation.rt_module, fn_name,
+                            compiled_host_fns[fn_name])
+                operation.rt_module.initialize()
+            else:
+                operation_list.append(operation.rt_module)
+                operation_key.append(key)
+        if len(operation_list) > 0:
+            cubin_image, host_lib, host_file = self.emit_compile_(
+                operation_list, compile_options)
+
+            err, module = cuda.cuModuleLoadData(cubin_image)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+
+            operation_names = []
+            operation_attrs = []
+            for operation, key in zip(operation_list, operation_key):
+                # get device kernels
+                err, operation.kernel = cuda.cuModuleGetFunction(
+                    module,
+                    bytes(str.encode(operation.name()))
+                )
+                # do not cache a kernel handle from a failed lookup
+                if err != cuda.CUresult.CUDA_SUCCESS:
+                    raise RuntimeError('Cuda Error: {}'.format(err))
+                operation_names.append(operation.name())
+                self.compiled_cache_device.insert(key, operation.kernel)
+                # get host functions
+                compiled_host_fns = {}
+                op_attr = []
+
+                # get param size
+                func_name = operation.name() + '_get_param_size'
+                func = getattr(host_lib, func_name)
+                param_size = func()
+
+                func_name = operation.name() + '_get_params'
+                func = getattr(host_lib, func_name)
+                # NOTE(review): ctypes reads the attribute `argtypes`; setting
+                # `argtype` only stores a plain attribute on the function
+                # object. Confirm against all call sites before renaming.
+                func.argtype = operation.argtype
+                func.restype = ctypes.POINTER(ctypes.c_char * param_size)
+                setattr(operation, 'get_args', func)
+                compiled_host_fns['get_args'] = func
+
+                # set shared memory size
+                func_name = operation.name() + '_shared_memory_size'
+                func = getattr(host_lib, func_name)
+                shared_memory_capacity = func()
+                setattr(operation, 'shared_memory_capacity',
+                        shared_memory_capacity)
+                compiled_host_fns['shared_memory_capacity'] = \
+                    shared_memory_capacity
+                # set the maximum dynamic shared size
+                operation.initialize()
+
+                # The first attribute is the parameter size; the names of
+                # the extra functions follow. load_operation relies on this
+                # layout.
+                op_attr.append(param_size)
+
+                if hasattr(operation, "extra_funcs"):
+                    for suffix in operation.extra_funcs:
+                        func_name = operation.name() + '_' + suffix
+                        func = getattr(host_lib, func_name)
+                        setattr(operation, suffix, func)
+                        compiled_host_fns[suffix] = func
+                        op_attr.append(suffix)
+
+                operation_attrs.append(op_attr)
+                self.compiled_cache_host.insert(key, compiled_host_fns)
+
+            # persist every newly compiled operation in the on-disk cache
+            for key, name, attrs in zip(
+                    operation_key, operation_names, operation_attrs):
+                self.insert_operation(
+                    key, cubin_image, host_file.name, name, attrs)
--- a/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py
@ -0,0 +1,645 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+from typeguard import typechecked
+from cuda import cuda
+from typing import Union
+import numpy as np
+
+from typeguard import typechecked
+
+from pycutlass import *
+
+
+# @typechecked
+class Conv2dArguments(ArgumentBase):
+    """
+    Argument wrapper for Conv2d. It encodes the problem information and the
+    user-provided tensors into the kernel's arguments.
+
+    :param operation: the Conv2d operation to take the argument
+    :type operation: :class:`pycutlass.Conv2dOperation`
+
+    :param A: tensor A
+    :type A: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
+
+    :param B: tensor B
+    :type B: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
+
+    :param C: tensor C
+    :type C: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
+
+    :param D: tensor D
+    :type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
+
+    :param split_k_mode: conv2d split K mode, defaults to 
+    cutlass.conv.SplitKMode.Serial
+    :type split_k_mode: cutlass.conv.SplitKMode, optional
+
+    :param output_op: output operator, optional
+    :type output_op: :class:`pycutlass.LinearCombinationFunctorArguments`
+
+    """
+
+    def __init__(self, operation: 'Conv2dOperation',
+                 problem_size: 'cutlass.conv.Conv2dProblemSize',
+                 A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
+                 B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
+                 C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
+                 D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
+                 split_k_mode: 'cutlass.conv.SplitKMode'
+                    = cutlass.conv.SplitKMode.Serial, **kwargs) -> None:
+        """
+        Build the arguments of one Conv2d launch.
+
+        Recognized keyword arguments are ``output_op`` (read for ``alpha``
+        and ``beta``) and ``split_k_slices``. All keyword arguments are also
+        forwarded to the base class constructor.
+        """
+        # Kind, layouts and element types are taken from the operation. They
+        # have to be in place before tensor B is reordered below.
+        self.conv_kind: cutlass.conv.Operator = operation.conv_kind
+        self.layout_A: cutlass.layout = operation.A.layout
+        self.element_A = operation.A.element
+        self.layout_B: cutlass.layout = operation.B.layout
+        self.element_B = operation.B.element
+        self.layout_C: cutlass.layout = operation.C.layout
+        self.element_C = operation.C.element
+
+        # tensor B is reordered when the C layout is TensorNC32HW32
+        if self.layout_C == cutlass.TensorNC32HW32:
+            B = self.reorder_tensor_B(B, problem_size)
+
+        super().__init__(A, B, C, D, **kwargs)
+
+        # alpha/beta come from the output op, except in parallel split-K mode
+        # or when no output op is given
+        has_output_op = "output_op" in kwargs.keys()
+        if has_output_op and split_k_mode != cutlass.conv.SplitKMode.Parallel:
+            output_op = kwargs["output_op"]
+            self.alpha = output_op.alpha
+            self.beta = output_op.beta
+        else:
+            self.alpha, self.beta = 1.0, 0.0
+
+        self.element_compute = operation.element_epilogue
+
+        # the requested split-K mode only applies together with an explicit
+        # number of slices
+        if "split_k_slices" not in kwargs.keys():
+            self.split_k_mode = cutlass.conv.SplitKMode.Serial
+            self.split_k_slices = 1
+        else:
+            self.split_k_mode = split_k_mode
+            self.split_k_slices = kwargs["split_k_slices"]
+
+        # the slice count is written back into the caller's problem size
+        self.problem_size: cutlass.conv.Conv2dProblemSize = problem_size
+        self.problem_size.split_k_slices = self.split_k_slices
+
+        self.operation = operation
+
+        # everything is set up: compute launch config, workspace and params
+        self.initialize()
+
+    # @typechecked
+    def reorder_tensor_B(self, tensor_B: 'np.ndarray', 
+            problem_size: 'cutlass.conv.Conv2dProblemSize'):
+        """
+        Reorder tensor_B for interleaved layout
+
+        The input is left untouched; the result is written into a new array
+        created with ``np.empty_like``.
+
+        :param tensor_B: input tensor B
+        :type tensor_B: numpy.ndarray
+        :param problem_size: Conv2d problem size
+        :type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
+
+        :return: reordered tensor B
+        :rtype: numpy.ndarray
+        """
+        destination = np.empty_like(tensor_B)
+        # tensor refs for the source first, then for the destination
+        source_ref, destination_ref = (
+            self.get_tensor_ref(
+                buffer, self.element_B, self.layout_B, problem_size, "b")
+            for buffer in (tensor_B, destination))
+        cutlass.conv.host.reorder_convK(
+            destination_ref, source_ref, self.conv_kind, problem_size)
+
+        return destination
+
+    def get_tensor_ref(
+        self, tensor, dtype, tensor_layout, problem_size, operand):
+        """
+        Build the tensor ref of one operand.
+
+        :param tensor: the tensor wrapped by the ref
+        :param dtype: element type of the tensor
+        :param tensor_layout: layout class; its ``packed`` method is called
+         with the operand's extent
+        :param problem_size: Conv2d problem size
+        :param operand: one of ``"a"``, ``"b"``, ``"c"``, ``"d"``
+
+        :raises ValueError: if ``operand`` is none of the above
+        """
+        # "c" and "d" share the extent function of the output tensor
+        extent_function_names = {
+            "a": "implicit_gemm_tensor_a_extent",
+            "b": "implicit_gemm_tensor_b_extent",
+            "c": "implicit_gemm_tensor_c_extent",
+            "d": "implicit_gemm_tensor_c_extent",
+        }
+        function_name = extent_function_names.get(operand)
+        if function_name is None:
+            raise ValueError("unknown operand: " + operand)
+
+        extent_function = getattr(cutlass.conv, function_name)
+        tensor_coord = extent_function(self.conv_kind, problem_size)
+        packed_layout = tensor_layout.packed(tensor_coord)
+
+        return TensorRef(tensor, dtype, packed_layout).tensor_ref
+
+    def get_arguments(self, semaphore):
+        """
+        Assemble ``self.c_arguments`` from the tensors, the problem size and
+        the epilogue scalars, and remember ``semaphore``.
+
+        :param semaphore: value stored in ``self.semaphore``
+        """
+        # D is described with the element type and layout of C
+        operands = (
+            (self.ptr_A, self.element_A, self.layout_A, "a"),
+            (self.ptr_B, self.element_B, self.layout_B, "b"),
+            (self.ptr_C, self.element_C, self.layout_C, "c"),
+            (self.ptr_D, self.element_C, self.layout_C, "d"),
+        )
+        ref_A, ref_B, ref_C, ref_D = [
+            TensorRef_(self.get_tensor_ref(
+                pointer, element, layout, self.problem_size, operand))
+            for pointer, element, layout, operand in operands
+        ]
+
+        # convert the scalars to the representation of the compute type
+        alpha, beta = self.alpha, self.beta
+        if self.element_compute == cutlass.float16:
+            alpha = cutlass.float16(alpha).storage
+            beta = cutlass.float16(beta).storage
+        elif self.element_compute == cutlass.int32:
+            alpha = int(alpha)
+            beta = int(beta)
+
+        argument_type, epilogue_type = get_conv2d_arguments(
+            self.operation.element_epilogue)
+
+        output_op = epilogue_type(alpha, beta, 0, 0)
+
+        self.c_arguments = argument_type(
+            Conv2DProblemSize(self.problem_size),
+            ref_A, ref_B, ref_C, ref_D, output_op, self.split_k_mode
+        )
+
+        self.semaphore = semaphore
+
+    def initialize(self):
+        """
+        Initialize the kernel arguments:
+
+        1. get the kernel launch configuration from the runtime module
+        2. allocate and zero-fill the device workspace, if one is needed
+        3. get the kernel params as a bytearray (``self.host_workspace``)
+
+        :raises RuntimeError: if zero-filling the device workspace fails
+        """
+        # get launch configuration
+        self.launch_config = self.operation.rt_module.plan(self)
+
+        # allocate and initialize device workspace
+        device_workspace_size = \
+            self.operation.rt_module.get_device_workspace_size(self)
+
+        if device_workspace_size > 0:
+            self.workspace_buffer = device_mem_alloc(device_workspace_size)
+            workspace_ptr = self.workspace_buffer.ptr
+            # the size is in bytes, the memset count in 32-bit words
+            err, = cuda.cuMemsetD32(
+                workspace_ptr, 0, device_workspace_size // 4)
+            # do not launch on a workspace that was not cleared
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError('Cuda Error: {}'.format(err))
+        else:
+            workspace_ptr = None
+
+        # get kernel params as bytearray
+        semaphore = 0
+        if workspace_ptr is not None and \
+            self.split_k_mode == cutlass.conv.SplitKMode.Parallel:
+            # parallel split-K: the kernel output goes to the workspace
+            self.ptr_D = workspace_ptr
+        elif workspace_ptr is not None and \
+            self.split_k_mode == cutlass.conv.SplitKMode.Serial:
+            # serial split-K: the workspace is passed as the semaphore
+            semaphore = workspace_ptr
+
+        self.get_arguments(semaphore)
+
+        params_ = self.operation.rt_module.get_args(ctypes.byref(
+            self.c_arguments), ctypes.c_void_p(int(self.semaphore)))
+        self.host_workspace = bytearray(params_.contents)
+        self.device_workspace = None
+
+    def sync(self):
+        """
+        Synchronize the arguments by delegating to the base class ``sync``
+        and returning its result.
+        """
+        synced = super().sync()
+        return synced
+
+
+# @typechecked
+class Conv2dRT(ExecutableOperation):
+    """
+    Conv2dRT manages the CUTLASS runtime components
+    """
+    # CUDA source template of the kernel entry point. ${operation_name} and
+    # ${operation_suffix} are substituted before compilation. The kernel
+    # reinterprets the dynamic shared memory as the operation's SharedStorage
+    # and invokes the operation with the params and that storage.
+    KernelTemplate = r'''
+extern "C"
+__global__ void
+${operation_name}(${operation_name}${operation_suffix}::Params params) {
+
+  // Dynamic shared memory base pointer
+  extern __shared__ int SharedStorageBase[];
+
+  // Declare pointer to dynamic shared memory.
+  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
+      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
+
+  ${operation_name}${operation_suffix} op;
+
+  op(params, *shared_storage);
+}
+    '''
+
+    # C++ source template of the host helpers, exported with C linkage:
+    # the size of Params, the size of SharedStorage, and a function that
+    # builds Params from Arguments plus a semaphore pointer and returns a
+    # heap-allocated byte copy of it.
+    # NOTE(review): neither the Params object nor the returned buffer is
+    # released inside this template.
+    HostTemplate = r'''
+extern "C" {
+  // Get the size of params in bytes
+  int ${operation_name}_get_param_size(){
+    return sizeof(${operation_name}${operation_suffix}::Params);
+  }
+
+  // Get the size of dynamic shared memory in bytes
+  int ${operation_name}_shared_memory_size() {
+    return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
+  }
+
+  // Get the params as byte array
+  char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Arguments* arguments, int *semaphore=nullptr){
+    typename ${operation_name}${operation_suffix}::Params* params;
+    params = new ${operation_name}${operation_suffix}::Params(*arguments, semaphore);
+
+    char *bytes = ((char*)(params));
+    char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
+    for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
+        output[i] = bytes[i];
+
+    return output;
+  }
+}
+
+    '''
+
+    def __init__(self, operation: 'Conv2dOperation'):
+        """
+        Set up the runtime module of a Conv2d operation.
+
+        :param operation: the operation this runtime module belongs to
+        :type operation: :class:`pycutlass.Conv2dOperation`
+        """
+        super().__init__(operation)
+
+        # ctypes signature of the host get_params function: a pointer to
+        # the arguments structure and a void pointer
+        argument_struct = get_conv2d_arguments(operation.element_epilogue)[0]
+        self.argtype = [ctypes.POINTER(argument_struct), ctypes.c_void_p]
+
+        self.operation: Conv2dOperation = operation
+        self.conv_kind = operation.conv_kind
+        self.swizzle_functor = operation.swizzling_functor
+        self.threads: int = operation.tile_description.num_threads
+
+        self.emitter = EmitConv2dInstance('_type')
+
+    def emit(self):
+        """ Return what the instance emitter emits for this operation. """
+        emitter = self.emitter
+        return emitter.emit(self.operation)
+
+    # @typechecked
+    def get_device_workspace_size(self, arguments: Conv2dArguments):
+        """
+        Return the size of the device workspace in bytes.
+
+        :param arguments: arguments of the launch; ``launch_config``,
+         ``split_k_mode``, ``split_k_slices`` and ``problem_size`` are read
+        :type arguments: :class:`Conv2dArguments`
+
+        :return: workspace size in bytes, ``0`` when no workspace is needed
+        :rtype: int
+        """
+        launch_config = arguments.launch_config
+
+        self.conv_kind = self.operation.conv_kind
+
+        mode = arguments.split_k_mode
+        if mode == cutlass.conv.SplitKMode.Parallel:
+            # one copy of tensor C per grid[2] entry; the division by 8
+            # presumably converts DataTypeSize from bits to bytes
+            total_size = DataTypeSize[self.operation.C.element] \
+                * launch_config.grid[2] \
+                * cutlass.conv.implicit_gemm_tensor_c_size(
+                    self.conv_kind, arguments.problem_size)
+            return total_size // 8
+
+        if mode == cutlass.conv.SplitKMode.Serial and \
+                arguments.split_k_slices > 1:
+            # 4 bytes for every (grid[0], grid[1]) entry
+            return launch_config.grid[0] * launch_config.grid[1] * 4
+
+        return 0
+
+    # @typechecked
+    def plan(self, arguments: Conv2dArguments):
+        """
+        Compute the launch configuration for the given arguments.
+
+        :param arguments: arguments of the launch; ``problem_size`` and
+         ``split_k_slices`` are read
+        :type arguments: :class:`Conv2dArguments`
+
+        :return: launch configuration with the grid from the swizzle functor,
+         a block of ``self.threads`` threads and the shared memory capacity
+        """
+        threadblock_shape = self.operation.tile_description.threadblock_shape
+        tile_size = cutlass.gemm.GemmCoord(
+            threadblock_shape[0], threadblock_shape[1], threadblock_shape[2])
+
+        tiled_shape = self.swizzle_functor.get_tiled_shape(
+            self.conv_kind, arguments.problem_size,
+            tile_size, arguments.split_k_slices)
+        grid = self.swizzle_functor.get_grid_shape(tiled_shape)
+
+        grid_dims = [grid.x, grid.y, grid.z]
+        block_dims = [self.threads, 1, 1]
+        return LaunchConfiguration(
+            grid_dims, block_dims, self.shared_memory_capacity)
+
+    def initialize(self):
+        """
+        Set the kernel's maximum dynamic shared memory size to
+        ``self.shared_memory_capacity``.
+
+        :raises RuntimeError: if the CUDA driver reports an error
+        """
+        attribute = \
+            cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+        (err,) = cuda.cuFuncSetAttribute(
+            self.kernel, attrib=attribute, value=self.shared_memory_capacity)
+        if err == cuda.CUresult.CUDA_SUCCESS:
+            return
+        raise RuntimeError('Cuda Error: {}'.format(err))
+
+#
+
+
+class Conv2dOperation:
+    """
+    Describes one CUTLASS Conv2d kernel configuration.
+
+    Besides storing the configuration, an instance derives the kernel's
+    procedural name from it and creates the :class:`Conv2dRT` runtime module
+    through which the kernel is launched.
+
+    :param conv_kind: convolution operator
+    :type conv_kind: :class:`cutlass.conv.Operator`
+
+    :param iterator_algorithm: selects among several implementation
+        variants trading off performance with simplicity
+    :type iterator_algorithm: :class:`cutlass.conv.IteratorAlgorithm`
+
+    :param arch: GPU compute capability (sm_xx)
+    :type arch: int
+
+    :param tile_description: tile description
+    :type tile_description: :class:`pycutlass.TileDescription`
+
+    :param A: tensor A description
+    :type A: :class:`pycutlass.TensorDescription`
+
+    :param B: tensor B description
+    :type B: :class:`pycutlass.TensorDescription`
+
+    :param C: tensor C description
+    :type C: :class:`pycutlass.TensorDescription`
+
+    :param element_epilogue: element type for computation in epilogue
+    :type element_epilogue: cutlass.int8 | cutlass.int32 | cutlass.float16 |
+        cutlass.bfloat16 | cutlass.float32 | cutlass.float64
+
+    :param stride_support: distinguish among partial specializations that
+        accelerate certain problems where convolution stride is unit
+    :type stride_support: :class:`cutlass.conv.StrideSupport`
+
+    :param epilogue_functor: convolution epilogue functor
+    :type epilogue_functor: :class:`EpilogueFunctor`
+
+    :param swizzling_functor: threadblock swizzling functor class; the
+        constructor instantiates it with no arguments
+    """
+
+    def __init__(self,
+                 conv_kind: cutlass.conv.Operator,
+                 iterator_algorithm: cutlass.conv.IteratorAlgorithm,
+                 arch: int, tile_description: TileDescription,
+                 A: TensorDescription, B: TensorDescription, C: TensorDescription,
+                 element_epilogue: Union[cutlass.int8, cutlass.int32, cutlass.float16,
+                                         cutlass.bfloat16, cutlass.float32, cutlass.float64],
+                 stride_support, epilogue_functor=EpilogueFunctor.LinearCombination,
+                 swizzling_functor=cutlass.IdentitySwizzle1):
+
+        # Fixed for every instance of this class
+        self.operation_kind = OperationKind.Conv2d
+
+        # Target and tiling
+        self.arch = arch
+        self.tile_description = tile_description
+        self.conv_kind = conv_kind
+
+        # Operand descriptions
+        self.A = A
+        self.B = B
+        self.C = C
+
+        # Epilogue and kernel variant selection
+        self.element_epilogue = element_epilogue
+        self.epilogue_functor = epilogue_functor
+        self.iterator_algorithm = iterator_algorithm
+        self.stride_support = stride_support
+
+        # The functor is passed in as a class and instantiated here
+        self.swizzling_functor = swizzling_functor()
+
+        # Created last: the runtime module receives this fully populated description
+        self.rt_module = Conv2dRT(self)
+
+    def run(self, arguments: Conv2dArguments) -> cuda.CUresult:
+        """
+        Launch the CUDA kernel through the runtime module
+
+        :param arguments: conv2d arguments supplying the host workspace, the
+            device workspace and the launch configuration
+        :type arguments: :class:`pycutlass.Conv2dArguments`
+
+        :return: the launch result, which is ``CUDA_SUCCESS`` whenever this returns
+        :raises RuntimeError: if the launch reports anything else
+        """
+        status = self.rt_module.run(
+            arguments.host_workspace,
+            arguments.device_workspace,
+            arguments.launch_config)
+
+        if status != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError('CUDA Error %s' % str(status))
+
+        return status
+
+    #
+    # Naming
+    #
+
+    def procedural_name(self):
+        ''' Name of the kernel; identical to :meth:`configuration_name`. '''
+        return self.configuration_name()
+
+    #
+    def configuration_name(self):
+        ''' Compose the name from opcode class, extended name, tile shape, stage count, layout and A alignment. '''
+
+        tile = self.tile_description
+        opcode_class_name = OpcodeClassNames[tile.math_instruction.opcode_class]
+
+        # <M>x<N>_<K>x<stages>
+        tb_shape = tile.threadblock_shape
+        threadblock = "%dx%d_%dx%d" % (tb_shape[0], tb_shape[1], tb_shape[2], tile.stages)
+
+        # Unit-stride specializations carry an extra marker in their name
+        configuration_name = (
+            "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_align${alignment}"
+            if self.stride_support == StrideSupport.Unity else
+            "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}"
+        )
+
+        substitutions = {
+            'opcode_class': opcode_class_name,
+            'extended_name': self.extended_name(),
+            'threadblock': threadblock,
+            'layout': self.layout_name(),
+            'alignment': "%d" % self.A.alignment,
+        }
+        return SubstituteTemplate(configuration_name, substitutions)
+
+    #
+    def extended_name(self):
+        ''' Core name, decorated with the A and C data types when they differ from the accumulator type. '''
+        accumulator = self.tile_description.math_instruction.element_accumulator
+        a_differs = self.A.element != accumulator
+        c_differs = self.C.element != accumulator
+
+        if not a_differs:
+            # A matches the accumulator: no decoration, whatever C is
+            pattern = "${core_name}"
+        elif c_differs:
+            pattern = "${element_c}_${core_name}_${element_a}"
+        else:
+            pattern = "${core_name}_${element_a}"
+
+        return SubstituteTemplate(pattern, {
+            'element_a': DataTypeNames[self.A.element],
+            'element_c': DataTypeNames[self.C.element],
+            'core_name': self.core_name()
+        })
+
+    #
+    def layout_name(self):
+        ''' Short name of tensor A's layout. '''
+        short_layout = ShortLayoutTypeNames[self.A.layout]
+        return "%s" % short_layout
+
+    #
+    def core_name(self):
+        ''' Accumulator letter, instruction shape, optional intermediate type, convolution kind and iterator algorithm. '''
+
+        math_inst = self.tile_description.math_instruction
+
+        # Both pieces stay empty unless the tensor-op branch below fills them
+        inst_shape = ''
+        intermediate_type = ''
+
+        if math_inst.opcode_class == cutlass.OpClass.TensorOp:
+            inst_shape = "%d%d%d" % tuple(math_inst.instruction_shape)
+            # Name the instruction's A type only when it is neither the
+            # tensor's element type nor the accumulator type
+            if math_inst.element_a != self.A.element and \
+                    math_inst.element_a != self.accumulator_type():
+                intermediate_type = DataTypeNames[math_inst.element_a]
+
+        name_parts = (
+            ShortDataTypeNames[self.accumulator_type()],
+            inst_shape,
+            intermediate_type,
+            ConvKindNames[self.conv_kind],
+            IteratorAlgorithmNames[self.iterator_algorithm],
+        )
+        return "%s%s%s%s_%s" % name_parts
+
+    #
+    def is_complex(self):
+        ''' True when the math operation is one of the two complex multiply-add variants. '''
+        math_operation = self.tile_description.math_instruction.math_operation
+        return math_operation in (
+            MathOperation.multiply_add_complex,
+            MathOperation.multiply_add_complex_gaussian,
+        )
+
+    #
+    def accumulator_type(self):
+        ''' Accumulator element type, mapped to its complex counterpart for complex operations. '''
+        accum = self.tile_description.math_instruction.element_accumulator
+        return get_complex_from_real(accum) if self.is_complex() else accum
+
+
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+
+class EmitConv2dInstance:
+    """
+    Emits the C++ source of a single Conv2d kernel instance.
+
+    ``template`` holds the C++ text with ``${...}`` placeholders and
+    ``includes`` lists the CUTLASS headers that go with it.
+
+    :param operation_suffix: text appended to the operation name of the emitted struct
+    """
+
+    def __init__(self, operation_suffix=''):
+        self.operation_suffix = operation_suffix
+        self.includes = [
+            "cutlass/cutlass.h",
+            "cutlass/conv/kernel/default_conv2d_fprop.h",
+            "cutlass/conv/kernel/default_conv2d_dgrad.h",
+            "cutlass/conv/kernel/default_conv2d_wgrad.h"
+        ]
+        self.template = """
+// Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+using ${operation_name}_base = 
+typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
+  ${element_a}, 
+  ${layout_a},
+  ${element_b}, 
+  ${layout_b},
+  ${element_c}, 
+  ${layout_c},
+  ${element_accumulator},
+  ${opcode_class},
+  ${arch},
+  cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+  cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
+  cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+  ${epilogue_functor}<
+    ${element_c},
+    ${epilogue_vector_length},
+    ${element_accumulator},
+    ${element_epilogue}
+  >,
+  ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
+  ${stages},
+  ${math_operator},
+  ${iterator_algorithm},
+  ${stride_support},
+  ${align_a},
+  ${align_b}
+>::Kernel;
+
+struct ${operation_name}${operation_suffix}:
+  public ${operation_name}_base { };
+
+"""
+
+    def emit(self, operation):
+        """
+        Fill the template with the values of ``operation``.
+
+        :param operation: operation description to instantiate
+        :return: the template text with every placeholder substituted
+        """
+        tile = operation.tile_description
+        math_inst = tile.math_instruction
+        tb_shape = tile.threadblock_shape
+        inst_shape = math_inst.instruction_shape
+
+        # Per-warp tile: threadblock tile divided by the warp count, per dimension
+        warp_shape = []
+        for idx in range(3):
+            warp_shape.append(int(tb_shape[idx] / tile.warp_count[idx]))
+
+        # Elements per epilogue access, with the access width capped at 128 bits
+        epilogue_vector_length = int(min(
+            operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+        values = {
+            'operation_name': operation.procedural_name(),
+            'operation_suffix': self.operation_suffix,
+            'conv_kind': ConvKindTag[operation.conv_kind],
+            'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
+            'element_a': DataTypeTag[operation.A.element],
+            'layout_a': LayoutTag[operation.A.layout],
+            'element_b': DataTypeTag[operation.B.element],
+            'layout_b': LayoutTag[operation.B.layout],
+            'element_c': DataTypeTag[operation.C.element],
+            'layout_c': LayoutTag[operation.C.layout],
+            'element_accumulator': DataTypeTag[operation.accumulator_type()],
+            'opcode_class': OpcodeClassTag[math_inst.opcode_class],
+            'arch': "cutlass::arch::Sm%d" % operation.arch,
+            'threadblock_shape_m': str(tb_shape[0]),
+            'threadblock_shape_n': str(tb_shape[1]),
+            'threadblock_shape_k': str(tb_shape[2]),
+            'warp_shape_m': str(warp_shape[0]),
+            'warp_shape_n': str(warp_shape[1]),
+            'warp_shape_k': str(warp_shape[2]),
+            'instruction_shape_m': str(inst_shape[0]),
+            'instruction_shape_n': str(inst_shape[1]),
+            'instruction_shape_k': str(inst_shape[2]),
+            'epilogue_vector_length': str(epilogue_vector_length),
+            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+            'swizzling_functor': operation.swizzling_functor.tag(),
+            'stages': str(tile.stages),
+            'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
+            'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
+            'stride_support': StrideSupportTag[operation.stride_support],
+            # Complex operations always use the plain complex multiply-add tag
+            'math_operator': (
+                'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex()
+                else MathOperationTag[math_inst.math_operation]),
+            'align_a': str(operation.A.alignment),
+            'align_b': str(operation.B.alignment),
+        }
+
+        return SubstituteTemplate(self.template, values)
--- a/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py
@ -0,0 +1,138 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+import struct
+
+
+def MaxAlignment(fmt):
+    """
+    Return the largest size, in bytes, of any single field in a struct format.
+
+    Each format character is sized on its own with ``struct.calcsize``; the
+    result is never smaller than 1.
+
+    :param fmt: :mod:`struct` format string, e.g. ``"ffPP"``
+    :return: size in bytes of the widest field
+    """
+    align = 1
+    for x in fmt:
+        # A repeat count ("4f", "10s") is not a field of its own, and
+        # struct.calcsize rejects a bare digit, so skip count characters.
+        if x.isdigit():
+            continue
+        align = max(align, struct.calcsize(x))
+    return align
+
+
+def AlignedOffset(offset, align):
+    """
+    Round ``offset`` up to the next multiple of ``align``.
+
+    :param offset: offset to align
+    :param align: required alignment
+    :return: ``offset`` itself when it is already a multiple of ``align``,
+        otherwise the next multiple above it
+    """
+    overshoot = offset % align
+    if not overshoot:
+        return offset
+    return offset + (align - overshoot)
+
+#################################################################################################
+#
+# Functors
+#
+#################################################################################################
+
+#
+
+
+class Functor:
+    """
+    Base class of the epilogue functors.
+
+    A functor carries a declaration string, a definition string, an
+    identifier and the :mod:`struct` format (``fmt``) of its packed Params
+    structure. All four start out empty.
+    """
+
+    def __init__(self):
+        self.decl = self.definition = self.fmt = self.identifier = ''
+
+    #
+    def emit_declaration(self):
+        ''' Return the declaration string. '''
+        return self.decl
+
+    #
+    def emit_definition(self):
+        ''' Return the definition string. '''
+        return self.definition
+
+    #
+    def size(self):
+        ''' Size in bytes of the packed Params structure described by ``fmt``. '''
+        packed_size = struct.calcsize(self.fmt)
+        return packed_size
+
+    #
+    def alignment(self):
+        ''' Largest single-field size in ``fmt``, as computed by ``MaxAlignment``. '''
+        return MaxAlignment(self.fmt)
+
+    #
+    def initialize(self, host_workspace, offset, arguments):
+        '''
+        Base implementation: writes nothing and returns the offset just past
+        this functor's Params structure.
+        '''
+        end_offset = offset + self.size()
+        return end_offset
+
+#################################################################################################
+
+#
+
+
+class LinearCombinationFunctorArguments:
+    """
+    Arguments of the linear combination functor.
+
+    :param alpha: value stored as ``alpha``
+    :param beta: value stored as ``beta``
+
+    ``alpha_ptr`` and ``beta_ptr`` are always initialised to 0.
+    """
+
+    def __init__(self, alpha=1.0, beta=0.0):
+        self.alpha, self.beta = alpha, beta
+        self.alpha_ptr = self.beta_ptr = 0
+
+#
+
+
+class LinearCombinationFunctor(Functor):
+    """
+    Epilogue functor for ``cutlass::epilogue::thread::LinearCombination``.
+
+    Its Params structure is packed with the :mod:`struct` format ``"ffPP"``:
+    two floats (alpha, beta) followed by two pointer-sized values (alpha_ptr,
+    beta_ptr). ``size()`` and ``alignment()`` are inherited from
+    :class:`Functor`, which derives both from ``self.fmt``.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self.decl = """
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >"""
+        self.identifier = 'linear_combination'
+        self.fmt = "ffPP"
+
+    #
+    def initialize(self, host_workspace, offset, arguments):
+        """
+        Pack the functor arguments into the host workspace.
+
+        :param host_workspace: writable buffer the Params structure is packed into
+        :param offset: first offset in ``host_workspace`` available to this functor
+        :param arguments: object providing ``alpha``, ``beta``, ``alpha_ptr`` and ``beta_ptr``
+        :return: offset just past the packed structure
+        """
+        # The structure has to start on a multiple of its widest field
+        offset = AlignedOffset(offset, self.alignment())
+
+        struct.pack_into(
+            self.fmt,
+            host_workspace, offset,
+            arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr)
+
+        return offset + self.size()
--- a/tools/library/scripts/pycutlass/src/pycutlass/frontend.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/frontend.py
@ -0,0 +1,104 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+import numpy as np
+from cuda import cuda
+from pycutlass.memory_manager import *
+from typing import TYPE_CHECKING
+try:
+    import torch
+    torch_available = True
+except ImportError:
+    torch_available = False
+    if TYPE_CHECKING:
+        import torch
+
+try:
+    import cupy as cp
+    cupy_available = True
+except ImportError:
+    cupy_available = False
+    if TYPE_CHECKING:
+        import cupy as cp
+
+
+class NumpyFrontend:
+    """
+    Frontend that turns numpy arrays into CUDA device pointers
+    """
+
+    @staticmethod
+    def argument(np_tensor: 'np.ndarray', is_output: 'bool') -> cuda.CUdeviceptr:
+        """Obtain a CUDA device pointer for a numpy array
+
+        :param np_tensor: input numpy nd array
+        :param is_output: when true, a buffer of the array's byte size is
+            requested from ``device_mem_alloc``; otherwise the array itself is
+            handed to ``todevice``
+
+        :return: CUDA device pointer
+        """
+        if not is_output:
+            # inputs: copy the data to device
+            return todevice(np_tensor)
+
+        # outputs: only storage of the matching size is requested
+        num_bytes = np_tensor.size * np_tensor.itemsize
+        return device_mem_alloc(num_bytes)
+
+
+class TorchFrontend:
+    """
+    Frontend that exposes torch tensors as CUDA device pointers
+    """
+
+    @staticmethod
+    def argument(torch_tensor: 'torch.Tensor') -> cuda.CUdeviceptr:
+        """Obtain a CUDA device pointer for a torch tensor
+
+        :param torch_tensor: input torch tensor; when it is not a CUDA tensor,
+            ``torch_tensor.to("cuda")`` is used in its place
+
+        :return: CUDA device pointer wrapping the tensor's ``data_ptr()``
+        """
+        # NOTE(review): when a conversion happens, the converted tensor is
+        # referenced only by this local while just its address is returned.
+        # Presumably its storage can be released once this function returns;
+        # confirm that callers only pass tensors that already live on the GPU.
+        device_tensor = torch_tensor if torch_tensor.is_cuda else torch_tensor.to("cuda")
+
+        return cuda.CUdeviceptr(device_tensor.data_ptr())
+
+
+class CupyFrontend:
+    """
+    Frontend that exposes cupy arrays as CUDA device pointers
+    """
+
+    @staticmethod
+    def argument(cupy_ndarray: 'cp.ndarray'):
+        """Wrap the array's ``data.ptr`` address in a ``cuda.CUdeviceptr``
+
+        :param cupy_ndarray: input cupy nd array
+
+        :return: CUDA device pointer
+        """
+        raw_address = int(cupy_ndarray.data.ptr)
+        return cuda.CUdeviceptr(raw_address)
--- a/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py
--- a/tools/library/scripts/pycutlass/src/pycutlass/library.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/library.py
@ -0,0 +1,790 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import re
+
+###################################################################################################
+
+import enum
+import cutlass
+
+# The following block implements enum.auto() for Python 3.5 variants that don't include it such
+# as the default 3.5.2 on Ubuntu 16.04.
+#
+# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
+
+try:
+    # Preferred: the standard library implementation
+    from enum import auto as enum_auto
+except ImportError:
+    # Fallback: a module-level counter handing out consecutive integers from 0
+    __cutlass_library_auto_enum = 0
+
+    def enum_auto() -> int:
+        """Return the next unused integer, starting at 0."""
+        global __cutlass_library_auto_enum
+        current = __cutlass_library_auto_enum
+        __cutlass_library_auto_enum = current + 1
+        return current
+
+###################################################################################################
+
+#
+
+
+class GeneratorTarget(enum.Enum):
+    """Enumeration of generator targets; ``Library`` is the only member defined."""
+    Library = enum_auto()
+
+#
+# Lower-case name of each GeneratorTarget member.
+GeneratorTargetNames = {
+    GeneratorTarget.Library: 'library', 
+}
+#
+
+###################################################################################################
+
+#
+# One-letter abbreviation of a data type. Only the six types listed here have
+# one; looking up any other type raises KeyError.
+ShortDataTypeNames = {
+    cutlass.int32: 'i',
+    cutlass.float16: 'h',
+    cutlass.float32: 's',
+    cutlass.float64: 'd',
+    cutlass.dtype.cf32: 'c',
+    cutlass.dtype.cf64: 'z',
+}
+
+#
+# Short textual name of each data type (e.g. cutlass.float16 -> "f16"); a
+# leading "c" marks the complex variants.
+# NOTE(review): the 2-bit types (u2, s2, cu2, cs2) that DataTypeTag lists have
+# no entry here - confirm whether that is intentional.
+DataTypeNames = {
+    cutlass.dtype.b1: "b1",
+    cutlass.dtype.u4: "u4",
+    cutlass.dtype.u8: "u8",
+    cutlass.dtype.u16: "u16",
+    cutlass.dtype.u32: "u32",
+    cutlass.dtype.u64: "u64",
+    cutlass.dtype.s4: "s4",
+    cutlass.int8: "s8",
+    cutlass.dtype.s16: "s16",
+    cutlass.int32: "s32",
+    cutlass.dtype.s64: "s64",
+    cutlass.float16: "f16",
+    cutlass.bfloat16: "bf16",
+    cutlass.float32: "f32",
+    cutlass.tfloat32: "tf32",
+    cutlass.float64: "f64",
+    cutlass.dtype.cf16: "cf16",
+    cutlass.dtype.cbf16: "cbf16",
+    cutlass.dtype.cf32: "cf32",
+    cutlass.dtype.ctf32: "ctf32",
+    cutlass.dtype.cf64: "cf64",
+    cutlass.dtype.cu4: "cu4",
+    cutlass.dtype.cu8: "cu8",
+    cutlass.dtype.cu16: "cu16",
+    cutlass.dtype.cu32: "cu32",
+    cutlass.dtype.cu64: "cu64",
+    cutlass.dtype.cs4: "cs4",
+    cutlass.dtype.cs8: "cs8",
+    cutlass.dtype.cs16: "cs16",
+    cutlass.dtype.cs32: "cs32",
+    cutlass.dtype.cs64: "cs64",
+}
+
+# C++ type name of each data type, as spelled in emitted source.
+# NOTE(review): the complex integer entries name their element type with a
+# cutlass:: qualifier (e.g. cutlass::uint8_t, cutlass::int16_t) while the real
+# entries use the unqualified uint8_t / int16_t - confirm the qualified names
+# exist on the C++ side.
+DataTypeTag = {
+    cutlass.dtype.b1: "cutlass::uint1b_t",
+    cutlass.dtype.u2: "cutlass::uint2b_t",
+    cutlass.dtype.u4: "cutlass::uint4b_t",
+    cutlass.dtype.u8: "uint8_t",
+    cutlass.dtype.u16: "uint16_t",
+    cutlass.dtype.u32: "uint32_t",
+    cutlass.dtype.u64: "uint64_t",
+    cutlass.dtype.s2: "cutlass::int2b_t",
+    cutlass.dtype.s4: "cutlass::int4b_t",
+    cutlass.int8: "int8_t",
+    cutlass.dtype.s16: "int16_t",
+    cutlass.int32: "int32_t",
+    cutlass.dtype.s64: "int64_t",
+    cutlass.float16: "cutlass::half_t",
+    cutlass.bfloat16: "cutlass::bfloat16_t",
+    cutlass.float32: "float",
+    cutlass.tfloat32: "cutlass::tfloat32_t",
+    cutlass.float64: "double",
+    cutlass.dtype.cf16: "cutlass::complex<cutlass::half_t>",
+    cutlass.dtype.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
+    cutlass.dtype.cf32: "cutlass::complex<float>",
+    cutlass.dtype.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
+    cutlass.dtype.cf64: "cutlass::complex<double>",
+    cutlass.dtype.cu2: "cutlass::complex<cutlass::uint2b_t>",
+    cutlass.dtype.cu4: "cutlass::complex<cutlass::uint4b_t>",
+    cutlass.dtype.cu8: "cutlass::complex<cutlass::uint8_t>",
+    cutlass.dtype.cu16: "cutlass::complex<cutlass::uint16_t>",
+    cutlass.dtype.cu32: "cutlass::complex<cutlass::uint32_t>",
+    cutlass.dtype.cu64: "cutlass::complex<cutlass::uint64_t>",
+    cutlass.dtype.cs2: "cutlass::complex<cutlass::int2b_t>",
+    cutlass.dtype.cs4: "cutlass::complex<cutlass::int4b_t>",
+    cutlass.dtype.cs8: "cutlass::complex<cutlass::int8_t>",
+    cutlass.dtype.cs16: "cutlass::complex<cutlass::int16_t>",
+    cutlass.dtype.cs32: "cutlass::complex<cutlass::int32_t>",
+    cutlass.dtype.cs64: "cutlass::complex<cutlass::int64_t>",
+}
+
+# Size of each data type in bits. A complex type holds a real and an imaginary
+# part, so it is twice as wide as its real counterpart.
+DataTypeSize = {
+    cutlass.dtype.b1: 1,
+    cutlass.dtype.u2: 2,
+    cutlass.dtype.u4: 4,
+    cutlass.dtype.u8: 8,
+    cutlass.dtype.u16: 16,
+    cutlass.dtype.u32: 32,
+    cutlass.dtype.u64: 64,
+    cutlass.dtype.s2: 2,
+    cutlass.dtype.s4: 4,
+    cutlass.int8: 8,
+    cutlass.dtype.s16: 16,
+    cutlass.int32: 32,
+    cutlass.dtype.s64: 64,
+    cutlass.float16: 16,
+    cutlass.bfloat16: 16,
+    cutlass.float32: 32,
+    cutlass.tfloat32: 32,
+    cutlass.float64: 64,
+    cutlass.dtype.cf16: 32,
+    cutlass.dtype.cbf16: 32,
+    cutlass.dtype.cf32: 64,
+    # two 32-bit tfloat32 parts
+    cutlass.dtype.ctf32: 64,
+    cutlass.dtype.cf64: 128,
+    cutlass.dtype.cu2: 4,
+    cutlass.dtype.cu4: 8,
+    cutlass.dtype.cu8: 16,
+    cutlass.dtype.cu16: 32,
+    cutlass.dtype.cu32: 64,
+    cutlass.dtype.cu64: 128,
+    cutlass.dtype.cs2: 4,
+    cutlass.dtype.cs4: 8,
+    cutlass.dtype.cs8: 16,
+    cutlass.dtype.cs16: 32,
+    cutlass.dtype.cs32: 64,
+    cutlass.dtype.cs64: 128,
+}
+
+###################################################################################################
+#
+
+
+class BlasMode(enum.Enum):
+    """BLAS mode selector with two members, ``symmetric`` and ``hermitian``."""
+    symmetric = enum_auto()
+    hermitian = enum_auto()
+
+
+#
+# Qualified cutlass::BlasMode enumerator name for each BlasMode member.
+BlasModeTag = {
+    BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
+    BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
+}
+
+#
+# Qualified cutlass::ComplexTransform enumerator name for each complex transform.
+ComplexTransformTag = {
+    cutlass.complex_transform.none: 'cutlass::ComplexTransform::kNone',
+    cutlass.complex_transform.conj: 'cutlass::ComplexTransform::kConjugate',
+}
+
+#
+# (real type, complex type) pairs. is_complex, get_complex_from_real and
+# get_real_from_complex all search this list linearly.
+RealComplexBijection = [
+    (cutlass.float16, cutlass.dtype.cf16),
+    (cutlass.float32, cutlass.dtype.cf32),
+    (cutlass.float64, cutlass.dtype.cf64),
+]
+
+#
+
+
+def is_complex(data_type):
+    """
+    Tell whether ``data_type`` is the complex member of a pair in
+    ``RealComplexBijection``.
+
+    :param data_type: data type to test
+    :return: ``True`` when it is, ``False`` otherwise
+    """
+    return any(data_type == complex_type for _, complex_type in RealComplexBijection)
+
+#
+
+
+def get_complex_from_real(real_type):
+    """
+    Look up the complex data type paired with ``real_type`` in
+    ``RealComplexBijection``.
+
+    :param real_type: real data type
+    :return: the paired complex type, or ``cutlass.dtype.invalid`` when
+        ``real_type`` has no pair
+    """
+    matches = [cplx for real, cplx in RealComplexBijection if real_type == real]
+    if matches:
+        return matches[0]
+    return cutlass.dtype.invalid
+
+#
+
+
+def get_real_from_complex(complex_type):
+    """
+    Look up the real data type paired with ``complex_type`` in
+    ``RealComplexBijection``.
+
+    :param complex_type: complex data type
+    :return: the paired real type, or ``cutlass.dtype.invalid`` when
+        ``complex_type`` has no pair
+    """
+    for pair in RealComplexBijection:
+        # each pair is (real type, complex type)
+        if complex_type == pair[1]:
+            return pair[0]
+    return cutlass.dtype.invalid
+
+#
+
+
+class ComplexMultiplyOp(enum.Enum):
+    """Complex multiply variant with two members, ``multiply_add`` and ``gaussian``."""
+    multiply_add = enum_auto()
+    gaussian = enum_auto()
+
+###################################################################################################
+
+#
+
+
+class MathOperation(enum.Enum):
+    """
+    Kinds of math operation. ``MathOperationNames`` and ``MathOperationTag``
+    give the textual name and the ``cutlass::arch`` tag of each member.
+    """
+    multiply_add = enum_auto()
+    multiply_add_saturate = enum_auto()
+    xor_popc = enum_auto()
+    multiply_add_fast_bf16 = enum_auto()
+    multiply_add_fast_f16 = enum_auto()
+    multiply_add_fast_f32 = enum_auto()
+    multiply_add_complex_fast_f32 = enum_auto()
+    multiply_add_complex = enum_auto()
+    multiply_add_complex_gaussian = enum_auto()
+
+
+#
+# Textual name of each MathOperation member (identical to the member name).
+MathOperationNames = {
+    MathOperation.multiply_add: 'multiply_add',
+    MathOperation.multiply_add_saturate: 'multiply_add_saturate',
+    MathOperation.xor_popc: 'xor_popc',
+    MathOperation.multiply_add_fast_bf16: 'multiply_add_fast_bf16',
+    MathOperation.multiply_add_fast_f16: 'multiply_add_fast_f16',
+    MathOperation.multiply_add_fast_f32: 'multiply_add_fast_f32',
+    MathOperation.multiply_add_complex_fast_f32: 'multiply_add_complex_fast_f32',
+    MathOperation.multiply_add_complex: 'multiply_add_complex',
+    MathOperation.multiply_add_complex_gaussian: 'multiply_add_complex_gaussian',
+}
+
+#
+# Qualified cutlass::arch operator tag for each MathOperation member.
+MathOperationTag = {
+    MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
+    MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
+    MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
+    MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
+    MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
+    MathOperation.multiply_add_fast_f32: 'cutlass::arch::OpMultiplyAddFastF32',
+    MathOperation.multiply_add_complex_fast_f32: 'cutlass::arch::OpMultiplyAddComplexFastF32',
+    MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
+    MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
+}
+
+###################################################################################################
+
+#
+LayoutTag = {
+    cutlass.ColumnMajor: 'cutlass::layout::ColumnMajor',
+    cutlass.RowMajor: 'cutlass::layout::RowMajor',
+    cutlass.layout.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
+    cutlass.layout.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
+    cutlass.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
+    cutlass.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
+    cutlass.layout.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
+    cutlass.layout.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
+    cutlass.TensorNHWC: 'cutlass::layout::TensorNHWC',
+    cutlass.layout.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
+    cutlass.layout.TensorNCHW: 'cutlass::layout::TensorNCHW',
+    cutlass.layout.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
+    cutlass.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
+    cutlass.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
+    cutlass.layout.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
+    cutlass.layout.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
+}
+
+#
+TransposedLayout = {
+    cutlass.ColumnMajor: cutlass.RowMajor,
+    cutlass.RowMajor: cutlass.ColumnMajor,
+    cutlass.layout.ColumnMajorInterleaved2: cutlass.layout.RowMajorInterleaved2,
+    cutlass.layout.RowMajorInterleaved2: cutlass.layout.ColumnMajorInterleaved2,
+    cutlass.ColumnMajorInterleaved32: cutlass.RowMajorInterleaved32,
+    cutlass.RowMajorInterleaved32: cutlass.ColumnMajorInterleaved32,
+    cutlass.layout.ColumnMajorInterleaved64: cutlass.layout.RowMajorInterleaved64,
+    cutlass.layout.RowMajorInterleaved64: cutlass.layout.ColumnMajorInterleaved64,
+    cutlass.TensorNHWC: cutlass.TensorNHWC
+}
+
+#
+# Short string code for each layout: 'n' = column-major, 't' = row-major (with the
+# interleave factor appended), and lower-case tensor-layout names for the rest.
+ShortLayoutTypeNames = {
+    cutlass.ColumnMajor: 'n',
+    cutlass.layout.ColumnMajorInterleaved2: 'n2',
+    cutlass.ColumnMajorInterleaved32: 'n32',
+    cutlass.layout.ColumnMajorInterleaved64: 'n64',
+    cutlass.RowMajor: 't',
+    cutlass.layout.RowMajorInterleaved2: 't2',
+    cutlass.RowMajorInterleaved32: 't32',
+    cutlass.layout.RowMajorInterleaved64: 't64',
+    cutlass.TensorNHWC: 'nhwc',
+    cutlass.layout.TensorNDHWC: 'ndhwc',
+    cutlass.layout.TensorNCHW: 'nchw',
+    cutlass.layout.TensorNGHWC: 'nghwc',
+    cutlass.TensorNC32HW32: 'nc32hw32',
+    cutlass.layout.TensorNC64HW64: 'nc64hw64',
+    cutlass.TensorC32RSK32: 'c32rsk32',
+    cutlass.layout.TensorC64RSK64: 'c64rsk64'
+}
+
+#
+# Short code for a (layout, complex transform) pair: without conjugation the codes
+# match ShortLayoutTypeNames ('n'/'t'); with conjugation they become 'c'/'h'.
+ShortComplexLayoutNames = {
+    (cutlass.ColumnMajor, cutlass.complex_transform.none): 'n',
+    (cutlass.ColumnMajor, cutlass.complex_transform.conj): 'c',
+    (cutlass.RowMajor, cutlass.complex_transform.none): 't',
+    (cutlass.RowMajor, cutlass.complex_transform.conj): 'h'
+}
+
+###################################################################################################
+
+#
+
+
+class SideMode(enum.Enum):
+    """Left/right side selector; SideModeTag maps it to cutlass::SideMode."""
+    Left = enum_auto()
+    Right = enum_auto()
+
+
+#
+# C++ enumerator spelling for each SideMode.
+SideModeTag = {
+    SideMode.Left: 'cutlass::SideMode::kLeft',
+    SideMode.Right: 'cutlass::SideMode::kRight'
+}
+
+# Two-letter short code for each SideMode.
+ShortSideModeNames = {
+    SideMode.Left: 'ls',
+    SideMode.Right: 'rs'
+}
+
+###################################################################################################
+
+#
+
+
+class FillMode(enum.Enum):
+    """Lower/upper fill selector; FillModeTag maps it to cutlass::FillMode."""
+    Lower = enum_auto()
+    Upper = enum_auto()
+
+
+#
+# C++ enumerator spelling for each FillMode.
+FillModeTag = {
+    FillMode.Lower: 'cutlass::FillMode::kLower',
+    FillMode.Upper: 'cutlass::FillMode::kUpper'
+}
+
+# One-letter short code for each FillMode.
+ShortFillModeNames = {
+    FillMode.Lower: 'l',
+    FillMode.Upper: 'u'
+}
+
+###################################################################################################
+
+#
+
+
+class DiagType(enum.Enum):
+    """Unit/non-unit diagonal selector; DiagTypeTag maps it to cutlass::DiagType."""
+    NonUnit = enum_auto()
+    Unit = enum_auto()
+
+
+#
+# C++ enumerator spelling for each DiagType.
+DiagTypeTag = {
+    DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
+    DiagType.Unit: 'cutlass::DiagType::kUnit'
+}
+
+# Two-letter short code for each DiagType.
+ShortDiagTypeNames = {
+    DiagType.NonUnit: 'nu',
+    DiagType.Unit: 'un'
+}
+
+###################################################################################################
+
+# Lower-case short name for each cutlass.OpClass value.
+OpcodeClassNames = {
+    cutlass.OpClass.Simt: 'simt',
+    cutlass.OpClass.TensorOp: 'tensorop',
+    cutlass.OpClass.WmmaTensorOp: 'wmma_tensorop',
+    cutlass.OpClass.SparseTensorOp: 'sptensorop'
+}
+
+# C++ tag type (cutlass::arch::OpClass*) for each cutlass.OpClass value.
+OpcodeClassTag = {
+    cutlass.OpClass.Simt: 'cutlass::arch::OpClassSimt',
+    cutlass.OpClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
+    cutlass.OpClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
+    cutlass.OpClass.SparseTensorOp: 'cutlass::arch::OpClassSparseTensorOp'
+}
+
+###################################################################################################
+
+#
+
+
+class OperationKind(enum.Enum):
+    """Top-level family of an operation; OperationKindNames gives the short name."""
+    Gemm = enum_auto()
+    RankK = enum_auto()
+    Rank2K = enum_auto()
+    Trmm = enum_auto()
+    Symm = enum_auto()
+    Conv2d = enum_auto()
+    Conv3d = enum_auto()
+
+
+#
+# Lower-case short name for each OperationKind.
+OperationKindNames = {
+    OperationKind.Gemm: 'gemm', OperationKind.RankK: 'rank_k', OperationKind.Rank2K: 'rank_2k', OperationKind.Trmm: 'trmm', OperationKind.Symm: 'symm', OperationKind.Conv2d: 'conv2d', OperationKind.Conv3d: 'conv3d'
+}
+
+# Architecture family name keyed by integer architecture number (60 and 61 both
+# map to 'pascal').
+ArchitectureNames = {
+    50: 'maxwell',
+    60: 'pascal',
+    61: 'pascal',
+    70: 'volta',
+    75: 'turing',
+    80: 'ampere',
+}
+
+# Shared memory budget in KB keyed by compute capability, as given by the
+# per-entry comments below.
+SharedMemPerCC = {
+    70: 96,  # 96KB of SMEM
+    72: 96,  # 96KB of SMEM
+    75: 64,  # 64KB of SMEM
+    80: 160,  # 164KB of SMEM - 4KB reserved for the driver
+    86: 100,  # 100KB of SMEM
+    87: 160,  # 164KB of SMEM - 4KB reserved for the driver
+}
+
+###################################################################################################
+
+#
+
+
+def SubstituteTemplate(template, values):
+    """Expand every ``${key}`` placeholder in *template* with ``values[key]``.
+
+    Substitution is repeated until a full pass over *values* changes nothing, so a
+    value may itself contain placeholders that are expanded on a later pass.
+
+    Args:
+        template: text containing ``${key}`` placeholders.
+        values: mapping from placeholder name to replacement string.
+
+    Returns:
+        The template text with all known placeholders expanded. Placeholders whose
+        key is not in *values* are left untouched.
+    """
+    text = template
+    changed = True
+    while changed:
+        changed = False
+        for key, value in values.items():
+            regex = "\\$\\{%s\\}" % key
+            # Use a callable replacement so `value` is inserted verbatim. A plain
+            # string would be parsed as a regex replacement template, which mangles
+            # backslashes ("\n" -> newline) or raises re.error on e.g. "\d".
+            newtext = re.sub(regex, lambda _match, _value=value: _value, text)
+            if newtext != text:
+                changed = True
+            text = newtext
+    return text
+
+###################################################################################################
+
+#
+
+
+class GemmKind(enum.Enum):
+    """Variant of GEMM operation; GemmKindNames gives the short name of each."""
+    Gemm = enum_auto()
+    Sparse = enum_auto()
+    Universal = enum_auto()
+    PlanarComplex = enum_auto()
+    PlanarComplexArray = enum_auto()
+    Grouped = enum_auto()
+
+
+#
+# Short name for each GemmKind. Note that Gemm and Universal share the name "gemm".
+GemmKindNames = {
+    GemmKind.Gemm: "gemm",
+    GemmKind.Sparse: "spgemm",
+    GemmKind.Universal: "gemm",
+    GemmKind.PlanarComplex: "gemm_planar_complex",
+    GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
+    GemmKind.Grouped: "gemm_grouped"
+}
+
+#
+
+
+class RankKKind(enum.Enum):
+    """Variant of rank-k operation; only Universal is defined."""
+    Universal = enum_auto()
+
+
+#
+# Short name for each RankKKind.
+RankKKindNames = {
+    RankKKind.Universal: "rank_k"
+}
+
+#
+
+
+class TrmmKind(enum.Enum):
+    """Variant of TRMM operation; only Universal is defined."""
+    Universal = enum_auto()
+
+
+#
+# Short name for each TrmmKind.
+TrmmKindNames = {
+    TrmmKind.Universal: "trmm"
+}
+
+#
+
+
+class SymmKind(enum.Enum):
+    """Variant of SYMM operation; only Universal is defined."""
+    Universal = enum_auto()
+
+
+#
+# Short name for each SymmKind.
+SymmKindNames = {
+    SymmKind.Universal: "symm"
+}
+
+#
+
+
+class EpilogueFunctor(enum.Enum):
+    """Epilogue functor selector; EpilogueFunctorTag gives the C++ template name."""
+    LinearCombination = enum_auto()
+    LinearCombinationClamp = enum_auto()
+    FastLinearCombinationClamp = enum_auto()
+
+
+#
+# C++ class template (cutlass::epilogue::thread::*) for each EpilogueFunctor.
+EpilogueFunctorTag = {
+    EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination',
+    EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp',
+    EpilogueFunctor.FastLinearCombinationClamp: 'cutlass::epilogue::thread::FastLinearCombinationClamp'
+}
+
+#
+
+
+class SwizzlingFunctor(enum.Enum):
+    """Threadblock swizzle selector; SwizzlingFunctorTag gives the C++ type name."""
+    Identity1 = enum_auto()
+    Identity2 = enum_auto()
+    Identity4 = enum_auto()
+    Identity8 = enum_auto()
+    Horizontal = enum_auto()
+    BatchedIdentity1 = enum_auto()
+    StridedDgradIdentity1 = enum_auto()
+    StridedDgradIdentity4 = enum_auto()
+    StridedDgradHorizontal = enum_auto()
+
+
+#
+# C++ type name for each threadblock swizzling functor.
+# The identity<1> swizzle is reachable both through the cutlass.IdentitySwizzle1
+# binding object and through the SwizzlingFunctor.Identity1 enum member, so that
+# every SwizzlingFunctor member has an entry and lookups never raise KeyError.
+SwizzlingFunctorTag = {
+    cutlass.IdentitySwizzle1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
+    SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
+    SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
+    SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
+    SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
+    SwizzlingFunctor.Horizontal: 'cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle',
+    SwizzlingFunctor.BatchedIdentity1: "cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle",
+    SwizzlingFunctor.StridedDgradIdentity1: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>',
+    SwizzlingFunctor.StridedDgradIdentity4: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>',
+    SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
+}
+
+#
+
+
+class SchedulerMode(enum.Enum):
+    """Scheduling mode selector; SchedulerModeTag maps it to GroupScheduleMode."""
+    # No trailing comma after enum_auto(): a comma would make the member's value a
+    # one-element tuple instead of the auto-generated integer.
+    Device = enum_auto()
+    Host = enum_auto()
+
+
+#
+# C++ enumerator (cutlass::gemm::kernel::GroupScheduleMode) for each SchedulerMode.
+SchedulerModeTag = {
+    SchedulerMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
+    SchedulerMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
+}
+
+# Short name for each SchedulerMode.
+ShortSchedulerModeNames = {
+    SchedulerMode.Device: 'Device',
+    SchedulerMode.Host: 'Host'
+}
+
+###################################################################################################
+
+
+#
+# C++ enumerator (cutlass::conv::Operator) for each convolution operator.
+ConvKindTag = {
+    cutlass.conv.Operator.fprop: 'cutlass::conv::Operator::kFprop',
+    cutlass.conv.Operator.dgrad: 'cutlass::conv::Operator::kDgrad',
+    cutlass.conv.Operator.wgrad: 'cutlass::conv::Operator::kWgrad'
+}
+
+# Short name for each convolution operator.
+ConvKindNames = {
+    cutlass.conv.Operator.fprop: 'fprop',
+    cutlass.conv.Operator.dgrad: 'dgrad',
+    cutlass.conv.Operator.wgrad: 'wgrad',
+}
+
+
+# C++ enumerator (cutlass::conv::IteratorAlgorithm) for each iterator algorithm.
+IteratorAlgorithmTag = {
+    cutlass.conv.IteratorAlgorithm.analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
+    cutlass.conv.IteratorAlgorithm.optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
+    cutlass.conv.IteratorAlgorithm.fixed_channels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
+    cutlass.conv.IteratorAlgorithm.few_channels: 'cutlass::conv::IteratorAlgorithm::kFewChannels'
+}
+
+# Short name for each iterator algorithm.
+IteratorAlgorithmNames = {
+    cutlass.conv.IteratorAlgorithm.analytic: 'analytic',
+    cutlass.conv.IteratorAlgorithm.optimized: 'optimized',
+    cutlass.conv.IteratorAlgorithm.fixed_channels: 'fixed_channels',
+    cutlass.conv.IteratorAlgorithm.few_channels: 'few_channels'
+}
+
+#
+
+
+class StrideSupport(enum.Enum):
+    """Stride support selector; StrideSupportTag maps it to cutlass::conv::StrideSupport."""
+    Strided = enum_auto()
+    Unity = enum_auto()
+
+
+#
+# C++ enumerator (cutlass::conv::StrideSupport) for each StrideSupport.
+StrideSupportTag = {
+    StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
+    StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
+}
+
+# Short name for each StrideSupport; Strided maps to the empty string.
+StrideSupportNames = {
+    StrideSupport.Strided: '',
+    StrideSupport.Unity: 'unity_stride',
+}
+
+
+class ConvMode(enum.Enum):
+    """Convolution mode selector; ConvModeTag maps it to cutlass::conv::Mode."""
+    CrossCorrelation = enum_auto()
+    Convolution = enum_auto()
+
+
+#
+# C++ enumerator (cutlass::conv::Mode) for each ConvMode.
+ConvModeTag = {
+    ConvMode.CrossCorrelation: 'cutlass::conv::Mode::kCrossCorrelation',
+    ConvMode.Convolution: 'cutlass::conv::Mode::kConvolution'
+}
+
+###################################################################################################
+
+#
+
+
+class MathInstruction:
+    """Plain record describing a math instruction.
+
+    Stores the instruction shape, the element types of operands A and B and of the
+    accumulator, the opcode class (default: Simt) and the math operation
+    (default: multiply_add). No validation is performed.
+    """
+    def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class=cutlass.OpClass.Simt, math_operation=MathOperation.multiply_add):
+        self.instruction_shape = instruction_shape
+        self.element_a = element_a
+        self.element_b = element_b
+        self.element_accumulator = element_accumulator
+        self.opcode_class = opcode_class
+        self.math_operation = math_operation
+
+#
+
+
+class TileDescription:
+    """Threadblock tile configuration: tile shape, pipeline depth and warp layout.
+
+    ``num_threads`` is derived at construction time as 32 multiplied by every
+    entry of ``warp_count``.
+    """
+
+    def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute):
+        # Thread count first: 32 threads for each warp, across all warp dimensions.
+        thread_total = 32
+        for warps_along_dim in warp_count:
+            thread_total *= warps_along_dim
+
+        self.threadblock_shape = threadblock_shape
+        #: number of pipeline stages
+        self.stages: int = stages
+        #: number of warps along x, y, z directions
+        self.warp_count: list[int] = warp_count
+        self.math_instruction = math_instruction
+        #: minimum compute capability
+        self.minimum_compute_capability: int = min_compute
+        #: maximum compute capability
+        self.maximum_compute_capability: int = max_compute
+        #: number threads per threadblock
+        self.num_threads: int = thread_total
+
+    def procedural_name(self):
+        """Return the tile name formatted as ``<M>x<N>_<K>x<stages>``."""
+        shape = self.threadblock_shape
+        mn_part = "%dx%d" % (shape[0], shape[1])
+        k_part = "%dx%d" % (shape[2], self.stages)
+        return mn_part + "_" + k_part
+
+#
+
+
+class TensorDescription:
+    """Element type, layout, alignment and complex transform of a tensor operand."""
+    def __init__(self, element, layout, alignment=1, complex_transform=cutlass.complex_transform.none):
+        self.element = element
+        self.layout = layout
+        # Clamp the requested alignment (in elements) so that one access never
+        # exceeds 128 bits: DataTypeSize gives the element width in bits.
+        self.alignment = min(128 // DataTypeSize[self.element], alignment)
+        self.complex_transform = complex_transform
+
+#
+
+
+class SymmetricTensorDescription:
+    """Tensor operand description that also carries a fill mode and side mode.
+
+    Unlike TensorDescription, the alignment is stored as given (not clamped).
+    """
+    def __init__(self, element, layout, fill_mode, alignment=1, complex_transform=cutlass.complex_transform.none, side_mode=SideMode.Left):
+        self.element = element
+        self.layout = layout
+        self.fill_mode = fill_mode
+        self.alignment = alignment
+        self.complex_transform = complex_transform
+        self.side_mode = side_mode
+
+#
+
+
+class TriangularTensorDescription:
+    """Tensor operand description carrying side mode, fill mode and diagonal type.
+
+    Unlike TensorDescription, the alignment is stored as given (not clamped).
+    """
+    def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment=1, complex_transform=cutlass.complex_transform.none):
+        self.element = element
+        self.layout = layout
+        self.side_mode = side_mode
+        self.fill_mode = fill_mode
+        self.diag_type = diag_type
+        self.alignment = alignment
+        self.complex_transform = complex_transform
+
+###################################################################################################
+
+#
+
+
+def CalculateSmemUsage(operation):
+    """Estimate the shared memory used by *operation*, in whole KB (rounded down).
+
+    The per-stage footprint of the A and B threadblock tiles is computed in bytes
+    from the element bit widths, multiplied by the number of pipeline stages, and
+    shifted right by 10 to convert bytes to KB.
+    """
+    tile = operation.tile_description
+    tile_m = tile.threadblock_shape[0]
+    tile_n = tile.threadblock_shape[1]
+    tile_k = tile.threadblock_shape[2]
+
+    is_sparse_gemm = (operation.operation_kind == OperationKind.Gemm
+                      and operation.gemm_kind == GemmKind.Sparse)
+    bits_a = DataTypeSize[operation.A.element]
+
+    if not is_sparse_gemm:
+        # Few BLAS3 operations only have A tensor, so A's element width is used
+        # for both operand tiles.
+        bytes_per_stage = bits_a * tile_m * tile_k // 8 + bits_a * tile_n * tile_k // 8
+    else:
+        # Elements represented by 8 bits of metadata (based on 4:8, 2:4 or 1:2 sparsity)
+        elements_per_8b_md = {32: 2, 4: 8}.get(bits_a, 4)
+        # The sparse A tile is stored compressed to half of K.
+        half_k = tile_k // 2
+        bytes_per_stage = (
+            bits_a * tile_m * half_k // 8
+            + DataTypeSize[operation.B.element] * tile_n * tile_k // 8
+            + tile_m * half_k // elements_per_8b_md
+        )
+
+    return (bytes_per_stage * tile.stages) >> 10
+###################################################################################################
--- a/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py
@ -0,0 +1,74 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import rmm
+import numpy as np
+
+
+class PoolMemoryManager:
+    """Installs an RMM pool allocator with allocation tracking as the current device resource.
+
+    The constructor builds a PoolMemoryResource on top of a CudaMemoryResource,
+    wraps it in a TrackingResourceAdaptor and registers that adaptor through
+    rmm.mr.set_current_device_resource.
+    """
+    def __init__(self, init_pool_size: int, max_pool_size: int) -> None:
+        self.pool = rmm.mr.PoolMemoryResource(
+            rmm.mr.CudaMemoryResource(),
+            initial_pool_size=init_pool_size,
+            maximum_pool_size=max_pool_size
+        )
+        # The tracking adaptor is what provides get_allocated_bytes().
+        self.mr = rmm.mr.TrackingResourceAdaptor(self.pool)
+        rmm.mr.set_current_device_resource(self.mr)
+
+    def get_allocated_size(self):
+        """Return the allocated byte count reported by the tracking adaptor."""
+        return self.mr.get_allocated_bytes()
+
+    def pool_size(self):
+        """Return the pool size reported by the underlying pool resource."""
+        return self.pool.pool_size()
+
+
+def todevice(host_data, dtype=np.float32):
+    """
+    Copy host_data into a new rmm.DeviceBuffer.
+
+    A list is first converted to a numpy array of `dtype`; a numpy array is copied
+    as-is (`dtype` is ignored). Any other input type yields None.
+    """
+    if isinstance(host_data, list):
+        payload = np.array(host_data, dtype=dtype).tobytes()
+    elif isinstance(host_data, np.ndarray):
+        payload = host_data.tobytes()
+    else:
+        # Unsupported input types fall through without allocating anything.
+        return None
+    return rmm.DeviceBuffer.to_device(payload)
+
+
+def device_mem_alloc(size):
+    """Return a new rmm.DeviceBuffer constructed with the given size."""
+    return rmm.DeviceBuffer(size=size)
+
+
+def align_size(size, alignment=256):
+    """Round size up to the nearest multiple of alignment."""
+    # Bump past the next boundary, then drop the remainder.
+    padded = size + alignment - 1
+    return padded - padded % alignment
+
+
+def get_allocated_size():
+    """Return get_allocated_bytes() of the current RMM device resource.
+
+    NOTE(review): this relies on the current resource exposing
+    get_allocated_bytes(), e.g. the tracking adaptor installed by
+    PoolMemoryManager - confirm callers always set that up first.
+    """
+    device_resource = rmm.mr.get_current_device_resource()
+    return device_resource.get_allocated_bytes()
--- a/tools/library/scripts/pycutlass/src/pycutlass/operation.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/operation.py
@ -0,0 +1,110 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+import ctypes
+from cuda import cuda
+
+################################################################################
+#
+# Launch configuration
+#
+################################################################################
+
+
+class LaunchConfiguration:
+    """Grid dimensions, block dimensions and shared-memory size of a kernel launch.
+
+    Args:
+        grid: three-element grid dimensions; defaults to a fresh ``[1, 1, 1]``.
+        block: three-element block dimensions; defaults to a fresh ``[1, 1, 1]``.
+        smem: value stored as ``shared_memory_capacity``; defaults to 0.
+    """
+
+    def __init__(self, grid=None, block=None, smem=0):
+        # Build the defaults per instance. A list literal in the signature is
+        # created once and shared, so mutating one configuration's grid/block
+        # would silently change every other default-constructed configuration.
+        self.grid = [1, 1, 1] if grid is None else grid
+        self.block = [1, 1, 1] if block is None else block
+        self.shared_memory_capacity = smem
+
+
+################################################################################
+#
+# Base class for an executable operation
+#
+# ##############################################################################
+
+class ExecutableOperation:
+    '''
+    Base class wrapping an operation description together with the handles
+    needed to launch it. `module` and `kernel` start as None and are not
+    assigned anywhere in this class. Subclasses override the methods that
+    raise NotImplementedError.
+    '''
+
+    def __init__(self, operation):
+        self.operation = operation
+        self.module = None
+        self.kernel = None
+
+    # Name of the operation, delegated to the wrapped description.
+    def name(self):
+        return self.operation.procedural_name()
+
+    # Source code for the operation; the base class emits nothing.
+    def emit(self):
+        return ''
+
+    # To be overridden by subclasses.
+    def can_implement(self, configuration, arguments):
+        raise NotImplementedError()
+
+    # To be overridden by subclasses.
+    def get_host_workspace_size(self, arguments):
+        raise NotImplementedError()
+
+    # To be overridden by subclasses.
+    def get_device_workspace_size(self, arguments):
+        raise NotImplementedError()
+
+    # To be overridden by subclasses.
+    def plan(self, arguments):
+        raise NotImplementedError()
+
+    # To be overridden by subclasses. Note the default stream object is created
+    # once, when the method is defined.
+    def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=cuda.CUstream(0)):
+        raise NotImplementedError()
+
+    # Launch self.kernel with host_workspace as its single kernel argument and
+    # return the status from cuLaunchKernel. device_workspace is not used here.
+    def run(self, host_workspace, device_workspace, launch_config, stream=cuda.CUstream(0)):
+
+        # View the host workspace as a ctypes char array without copying;
+        # from_buffer requires a writable buffer such as a bytearray.
+        cArg = (ctypes.c_char * len(host_workspace)
+                ).from_buffer(host_workspace)
+        # Kernel arguments are passed as an array of pointers; there is exactly
+        # one entry, pointing at the packed parameter bytes.
+        packed = (ctypes.c_void_p * 1)()
+        packed[0] = ctypes.addressof(cArg)
+
+        err, = cuda.cuLaunchKernel(
+            self.kernel,
+            launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
+            launch_config.block[0], launch_config.block[1], launch_config.block[2],
+            launch_config.shared_memory_capacity,
+            stream,
+            packed,
+            0)
+
+        return err
--- a/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py
@ -0,0 +1,402 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+from pycutlass import *
+from pycutlass.c_types import get_reduction_params
+import cutlass
+from cuda import cuda
+try:
+    import torch
+    torch_available = True
+except ImportError:
+    torch_available = False
+import numpy as np
+from typing import Union
+from cuda import cudart
+
+
+# Placeholder so that ReductionOperation can be used in the annotations of the
+# classes below; the full class of the same name is defined later in this module
+# and rebinds the name.
+class ReductionOperation:
+    pass
+
+
+class ReductionArguments:
+    """
+    Arguments of reduction.
+
+    Resolves the destination/source operands to device pointers, derives the
+    problem size and partition stride, and packs everything into the byte buffer
+    ``host_workspace`` via the operation's runtime module.
+
+    :param operation: reduction operation these arguments belong to
+    :param problem_size: two-element [rows, columns] extent of the output
+    :param partitions: number of split-k partitions
+    :param workspace: device pointer to the workspace
+    :param destination: output tensor (device pointer, numpy array or torch tensor)
+    :param source: source tensor, of the same kind as ``destination``
+    :param output_op: optional keyword; an object with ``alpha`` and ``beta``
+        attributes. Defaults to alpha = 1.0, beta = 0.0 when absent.
+    :raises TypeError: if ``destination`` is of an unsupported type
+    """
+
+    def __init__(self, operation: ReductionOperation,
+                 problem_size: 'list[int]', partitions: int,
+                 workspace: cuda.CUdeviceptr,
+                 destination: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
+                 source: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]', **kwargs) -> None:
+
+        self.operation = operation
+        #: pointer to the workspace
+        self.ptr_workspace = workspace
+
+        #: number of split-k partitions
+        self.partitions = partitions
+
+        # The type of `destination` selects the frontend; `source` is assumed to
+        # be of the same kind.
+        if isinstance(destination, np.ndarray):
+            # Keep the host array so sync() can copy the result back into it.
+            self.host_D = destination
+            self.destination_buffer = NumpyFrontend.argument(destination, True)
+            self.source_buffer = NumpyFrontend.argument(source, False)
+            self.ptr_destination = cuda.CUdeviceptr(
+                self.destination_buffer.ptr)
+            self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
+        elif torch_available and isinstance(destination, torch.Tensor):
+            self.ptr_destination = TorchFrontend.argument(destination)
+            self.ptr_source = TorchFrontend.argument(source)
+        elif isinstance(destination, cuda.CUdeviceptr):
+            self.ptr_destination = destination
+            self.ptr_source = source
+        else:
+            raise TypeError("unknown Type")
+
+        self.problem_size = MatrixCoord_(
+            problem_size[0], problem_size[1]
+        )
+
+        # rows * columns * element width of C, converted from bits to bytes
+        self.partition_stride = problem_size[0] * \
+            problem_size[1] * DataTypeSize[operation.C.element] // 8
+
+        if "output_op" in kwargs:
+            self.alpha = kwargs["output_op"].alpha
+            self.beta = kwargs["output_op"].beta
+        else:
+            self.alpha = 1.0
+            self.beta = 0.0
+
+        # get arguments
+        self.get_arguments()
+
+    @staticmethod
+    def get_tensor_ref(extent: 'tuple[int]', device_ptr: cuda.CUdeviceptr, layout: cutlass.layout):
+        """
+        Build a 2D tensor reference for a row-major tensor.
+
+        The stride passed to TensorRef2D_ is ``extent[1]`` (the column count).
+
+        :raises ValueError: for any layout other than cutlass.RowMajor
+        """
+        if layout == cutlass.RowMajor:
+            return TensorRef2D_(int(device_ptr), extent[1])
+        else:
+            raise ValueError("unknown layout type")
+
+    def get_arguments(self):
+        """
+        Pack the kernel arguments and store them in ``self.host_workspace``.
+
+        Note: this rewrites ``self.alpha``/``self.beta`` in place for float16 and
+        int32 compute types, so it is meant to be called once (from ``__init__``).
+        """
+        extent = [self.problem_size.row, self.problem_size.column]
+
+        ref_workspace = ReductionArguments.get_tensor_ref(
+            extent=extent,
+            device_ptr=self.ptr_workspace, layout=cutlass.RowMajor)
+
+        ref_source = ReductionArguments.get_tensor_ref(
+            extent=extent,
+            device_ptr=self.ptr_source, layout=cutlass.RowMajor)
+
+        ref_destination = ReductionArguments.get_tensor_ref(
+            extent=extent,
+            device_ptr=self.ptr_destination, layout=cutlass.RowMajor)
+
+        argument_type, epilogue_type = get_reduction_params(
+            self.operation.element_compute)
+
+        # Convert the scalars to the representation the compute type expects.
+        if self.operation.element_compute == cutlass.float16:
+            self.alpha = cutlass.float16(self.alpha).storage
+            self.beta = cutlass.float16(self.beta).storage
+        elif self.operation.element_compute == cutlass.int32:
+            self.alpha = int(self.alpha)
+            self.beta = int(self.beta)
+
+        output_op = epilogue_type(self.alpha, self.beta, 0, 0)
+        self.c_arguments = argument_type(
+            self.problem_size, self.partitions,
+            self.partition_stride, ref_workspace,
+            ref_destination, ref_source,
+            output_op
+        )
+
+        params_ = self.operation.rt_module.get_args(
+            ctypes.byref(self.c_arguments))
+        self.host_workspace = bytearray(params_.contents)
+
+    def sync(self):
+        """
+        Wait for the device and, for numpy inputs, copy the result back to host.
+
+        :raises RuntimeError: if synchronization or the device-to-host copy fails
+        """
+        # cudaDeviceSynchronize is a runtime-API call, so its status is a
+        # cudart.cudaError_t and is compared against the runtime success code.
+        err, = cudart.cudaDeviceSynchronize()
+        if err != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError("CUDA Error %s" % str(err))
+
+        # host_D only exists when the destination was given as a numpy array.
+        if hasattr(self, "host_D"):
+            err, = cuda.cuMemcpyDtoH(
+                self.host_D, self.ptr_destination, self.host_D.size * self.host_D.itemsize)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError("CUDA Error %s" % str(err))
+
+    def free(self):
+        """Drop the references to the buffers created for numpy inputs, if any."""
+        if hasattr(self, "destination_buffer"):
+            del self.destination_buffer
+        if hasattr(self, "source_buffer"):
+            del self.source_buffer
+
+
+class ReductionRT(ExecutableOperation):
+    """
+    ReductionRT manages the CUTLASS runtime components for reduction.
+
+    It holds the C++ source templates for the device kernel entry point and the
+    host-side helper functions, and computes the launch configuration.
+
+    NOTE(review): `shared_memory_capacity` is read by plan() and initialize() but
+    is not assigned in this class; presumably it is set externally after
+    compilation - confirm.
+    """
+    # Device entry point: reinterprets dynamic shared memory as the operation's
+    # SharedStorage and invokes the operation with the given params.
+    KernelTemplate = r'''
+extern "C"
+__global__ void
+${operation_name}(${operation_name}${operation_suffix}::Params params) {
+
+  // Dynamic shared memory base pointer
+  extern __shared__ int SharedStorageBase[];
+
+  // Declare pointer to dynamic shared memory.
+  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
+      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
+
+  ${operation_name}${operation_suffix} op;
+
+  op(params, *shared_storage);
+}
+    '''
+    # Host helpers: size of Params, size of SharedStorage, and a function that
+    # returns a newly allocated (new char[]) byte copy of a Params struct.
+    HostTemplate = r'''
+extern "C" {
+  // Get the size of params in bytes
+  int ${operation_name}_get_param_size(){
+    return sizeof(${operation_name}${operation_suffix}::Params);
+  }
+
+  // Get the size of dynamic shared memory in bytes
+  int ${operation_name}_shared_memory_size() {
+    return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
+  }
+
+  // Get the params as byte array
+  char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Params* params){
+    char *bytes = ((char*)(params));
+    char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
+    for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
+        output[i] = bytes[i];
+
+    return output;
+  }
+}
+    '''
+
+    def __init__(self, operation: ReductionOperation):
+        super().__init__(operation)
+
+        self.operation: ReductionOperation = operation
+        # '_type' is the suffix appended to the operation name for the emitted struct.
+        self.emitter = EmitReductionInstance('_type')
+
+        self.elements_per_access = self.operation.count
+        # ctypes signature: a single pointer to the reduction argument struct.
+        self.argtype = [ctypes.POINTER(
+            get_reduction_params(operation.element_compute)[0])]
+
+    def emit(self):
+        """Return the C++ instance declaration produced by the emitter."""
+        return self.emitter.emit(self.operation)
+
+    def plan(self, arguments: ReductionArguments):
+        """Compute the launch configuration for the given arguments."""
+        # Block: (tile columns / elements per access) x tile rows x 1.
+        block_shape = [self.operation.shape.column(
+        ) // self.elements_per_access, self.operation.shape.row(), 1]
+        # Grid: ceil-divide the problem rows/columns by the tile rows/columns.
+        grid_shape = [
+            (arguments.problem_size.row + self.operation.shape.row() -
+             1) // self.operation.shape.row(),
+            (arguments.problem_size.column + self.operation.shape.column() -
+                1) // self.operation.shape.column(),
+            1
+        ]
+        return LaunchConfiguration(grid_shape, block_shape, self.shared_memory_capacity)
+
+    # Note: takes no arguments, unlike ExecutableOperation.initialize.
+    def initialize(self):
+        """Set the kernel's maximum dynamic shared memory size attribute.
+
+        Raises RuntimeError if cuFuncSetAttribute does not return CUDA_SUCCESS.
+        """
+        err, = cuda.cuFuncSetAttribute(
+            self.kernel,
+            attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+            value=self.shared_memory_capacity)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError('Cuda Error: {}'.format(err))
+
+
+class ReductionOperation:
+    """
+    CUTLASS Reduction Operation
+
+    shape: shape of CTA
+    C: description of the output operand (supplies the output element type)
+    element_accumulator: datatype of accumulator
+    element_workspace: datatype of workspace (defaults to element_accumulator)
+    element_compute: datatype of compute (defaults to element_accumulator)
+    epilogue_functor: output operator
+    count: reduce op processing size
+    partitions_per_stage: number of partitions to reduce per stage
+    """
+
+    def __init__(self, shape: cutlass.MatrixCoord, C: TensorDescription,
+                 element_accumulator, element_workspace=None,
+                 element_compute=None, epilogue_functor: EpilogueFunctor = EpilogueFunctor.LinearCombination,
+                 count: int = 1, partitions_per_stage: int = 4) -> None:
+        """ Constructor
+        """
+
+        self.shape = shape
+        #: epilogue functor (default: LinearCombination)
+        self.epilogue_functor: EpilogueFunctor = epilogue_functor
+        #: datatype of accumulator
+        self.element_accumulator = element_accumulator
+
+        if element_workspace is None:
+            #: datatype of workspace
+            self.element_workspace = element_accumulator
+        else:
+            #: datatype of workspace
+            self.element_workspace = element_workspace
+
+        if element_compute is None:
+            #: datatype of compute
+            self.element_compute = element_accumulator
+        else:
+            #: datatype of compute
+            self.element_compute = element_compute
+
+        #: datatype of output
+        self.element_output = C.element
+
+        #: operand C
+        self.C: TensorDescription = C
+
+        #: reduce op processing size
+        self.count: int = count
+
+        #: number of partitions to reduce per stage
+        self.partitions_per_stage: int = partitions_per_stage
+
+        # Created last: ReductionRT reads count and element_compute from self.
+        self.rt_module: ReductionRT = ReductionRT(self)
+
+    #
+    def extended_name(self):
+        ''' Data type names joined as workspace_accumulator_compute_output'''
+        extend_name = "${element_workspace}_${element_accumulator}_${element_compute}_${element_output}"
+
+        return SubstituteTemplate(extend_name,
+                                  {
+                                      'element_workspace': DataTypeNames[self.element_workspace],
+                                      'element_accumulator': DataTypeNames[self.element_accumulator],
+                                      'element_compute': DataTypeNames[self.element_compute],
+                                      'element_output': DataTypeNames[self.element_output]
+                                  })
+
+    #
+    def configuration_name(self):
+        ''' The configuration name indicates the extended name and the tile size'''
+
+        configuration_name = "cutlass_reduce_split_k_${extended_name}_${threadblock}"
+
+        threadblock = "%dx%d" % (
+            self.shape.row(),
+            self.shape.column()
+        )
+
+        return SubstituteTemplate(
+            configuration_name,
+            {
+                'extended_name': self.extended_name(),
+                'threadblock': threadblock
+            }
+        )
+
+    #
+    def procedural_name(self):
+        ''' The full procedural name; identical to the configuration name'''
+        return self.configuration_name()
+
+    def run(self, arguments: ReductionArguments) -> cuda.CUresult:
+        """
+        Configure and launch the cuda kernel with input arguments
+
+        Raises RuntimeError if the launch does not return CUDA_SUCCESS.
+        """
+        # get launch configuration
+        launch_config = self.rt_module.plan(arguments)
+
+        # get the host and device workspace
+        host_workspace = arguments.host_workspace
+        device_workspace = None
+
+        # launch the kernel
+        err = self.rt_module.run(
+            host_workspace, device_workspace, launch_config)
+
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError('CUDA Error %s' % str(err))
+
+        return err
+
+
+class EmitReductionInstance:
+    """Emits the C++ declaration of a split-K reduction kernel instance.
+
+    `operation_suffix` is appended to the operation name to form the name of the
+    emitted struct, which derives from the `<operation_name>_base` alias.
+    """
+    def __init__(self, operation_suffix='') -> None:
+        self.operation_suffix = operation_suffix
+        # Header paths associated with the emitted code.
+        self.includes = [
+            "cutlass/cutlass.h",
+            "cutlass/numeric_types.h",
+            "cutlass/arch/arch.h",
+            "cutlass/arch/mma.h",
+            "cutlass/layout/matrix.h",
+            "cutlass/gemm/device/gemm.h",
+            "cutlass/gemm/device/gemm_universal_adapter.h",
+            "cutlass/gemm/kernel/default_gemm_universal.h",
+            "cutlass/reduction/kernel/reduce_split_k.h",
+            "cutlass/reduction/thread/reduction_operators.h"
+        ]
+        # C++ template; ${...} placeholders are filled in by emit().
+        self.template = """
+// Reduction kernel instance
+using ${operation_name}_base = 
+typename cutlass::reduction::kernel::ReduceSplitK<
+  cutlass::MatrixShape<${shape_row}, ${shape_column}>,
+  ${epilogue_functor}<
+    ${element_output},
+    ${epilogue_vector_length},
+    ${element_accumulator},
+    ${element_compute}
+  >,
+  cutlass::reduction::thread::ReduceAdd<
+    ${element_accumulator},
+    ${element_output},
+    ${count}>,
+  ${partition_per_stage}>;
+
+struct ${operation_name}${operation_suffix}:
+  public ${operation_name}_base { };
+      """
+
+    def emit(self, operation: ReductionOperation):
+        """Return the template text instantiated for `operation`."""
+
+        # Elements per epilogue access: C's alignment, capped so that one access
+        # spans at most 128 bits.
+        epilogue_vector_length = int(min(
+            operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+        # 'element_workspace' is supplied but has no placeholder in the template.
+        values = {
+            'operation_name': operation.configuration_name(),
+            'operation_suffix': self.operation_suffix,
+            'shape_row': str(operation.shape.row()),
+            'shape_column': str(operation.shape.column()),
+            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+            'element_output': DataTypeTag[operation.element_output],
+            'epilogue_vector_length': str(epilogue_vector_length),
+            'element_accumulator': DataTypeTag[operation.element_accumulator],
+            'element_compute': DataTypeTag[operation.element_compute],
+            'element_workspace': DataTypeTag[operation.element_workspace],
+            'count': str(operation.count),
+            'partition_per_stage': str(operation.partitions_per_stage)
+        }
+
+        return SubstituteTemplate(self.template, values)
--- a/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py
@ -0,0 +1,71 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+from typeguard import typechecked
+import numpy as np
+try:
+    import torch
+    torch_available = True
+except ImportError:
+    torch_available = False
+from cuda import cuda
+try:
+    import cupy as cp
+    cupy_available = True
+except ImportError:
+    cupy_available = False
+import cutlass
+
+
+# @typechecked
+class TensorRef:
+    """
+    Python wrapper around the object returned by ``cutlass.get_tensor_ref``.
+
+    The wrapped reference is stored in ``self.tensor_ref``.
+    """
+    def __init__(self, tensor, dtype, layout) -> None:
+        """
+        :param tensor: a numpy array, a torch tensor (when torch is importable), a cupy
+            array (when cupy is importable), a ``cuda.CUdeviceptr`` or an ``int`` address
+        :param dtype: callable; ``dtype(0)`` is forwarded to ``cutlass.get_tensor_ref``
+        :param layout: layout object forwarded to ``cutlass.get_tensor_ref``
+        :raises NotImplementedError: when ``tensor`` is none of the kinds listed above
+        """
+        # Resolve the address of the underlying buffer for each supported container.
+        if isinstance(tensor, np.ndarray):
+            address = tensor.__array_interface__['data'][0]
+            device_ptr = cuda.CUdeviceptr(address)
+        elif torch_available and isinstance(tensor, torch.Tensor):
+            device_ptr = cuda.CUdeviceptr(tensor.data_ptr())
+        elif cupy_available and isinstance(tensor, cp.ndarray):
+            device_ptr = cuda.CUdeviceptr(int(tensor.data.ptr))
+        elif isinstance(tensor, cuda.CUdeviceptr):
+            device_ptr = tensor
+        elif isinstance(tensor, int):
+            device_ptr = cuda.CUdeviceptr(tensor)
+        else:
+            raise NotImplementedError(tensor)
+
+        raw_address = int(device_ptr)
+        # An instance of the element type (dtype(0)) selects the overload among
+        # different data types that share the same layout.
+        element_tag = dtype(0)
+        self.tensor_ref = cutlass.get_tensor_ref(raw_address, element_tag, layout)
+
--- a/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py
@ -0,0 +1,4 @@
+from pycutlass.test.profiler import *
+from pycutlass.test.conv2d_testbed import *
+from pycutlass.test.gemm_testbed import *
+from pycutlass.test.gemm_grouped_testbed import *
--- a/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py
@ -0,0 +1,646 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+from time import sleep
+from bfloat16 import bfloat16
+import subprocess
+from typeguard import typechecked
+import re
+
+
+
+def getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand):
+    """
+    Build the cutlass TensorRef matching a numpy tensor used as a conv2d operand.
+
+    :param tensor: numpy array; its data address and dtype select the TensorRef
+    :param tensor_layout: layout class; ``packed`` is called on it with the operand extent
+    :param conv_kind: convolution kind forwarded to the extent helpers
+    :param problem_size: conv2d problem size forwarded to the extent helpers
+    :param operand: one of "a", "b", "c" or "d"
+    :raises ValueError: on an unknown operand or an unsupported dtype
+    """
+    address = tensor.__array_interface__['data'][0]
+
+    # Pick the extent helper for the role this tensor plays; "c" and "d" share one.
+    if operand == "a":
+        extent_of = cutlass.conv.implicit_gemm_tensor_a_extent
+    elif operand == "b":
+        extent_of = cutlass.conv.implicit_gemm_tensor_b_extent
+    elif operand in ["c", "d"]:
+        extent_of = cutlass.conv.implicit_gemm_tensor_c_extent
+    else:
+        raise ValueError("unknown operand: " + operand)
+    extent = extent_of(conv_kind, problem_size)
+
+    packed_layout = tensor_layout.packed(extent)
+
+    element = tensor.dtype
+    if element == np.float64:
+        return cutlass.TensorRefF64NHWC(address, packed_layout)
+    if element == np.float32:
+        return cutlass.TensorRefF32NHWC(address, packed_layout)
+    if element == np.float16:
+        return cutlass.TensorRefF16NHWC(address, packed_layout)
+    if element == bfloat16:
+        return cutlass.TensorRefBF16NHWC(address, packed_layout)
+    if element == np.int32:
+        return cutlass.TensorRefS32NHWC(address, packed_layout)
+    if element == np.int8:
+        # int8 is the only element type with layout-specific reference classes here
+        if tensor_layout == cutlass.TensorNC32HW32:
+            return cutlass.TensorRefS8NC32HW32(address, packed_layout)
+        if tensor_layout == cutlass.TensorC32RSK32:
+            return cutlass.TensorRefS8C32RSK32(address, packed_layout)
+        return cutlass.TensorRefS8NHWC(address, packed_layout)
+
+    raise ValueError("unsupported data type")
+
+def getTensorView(tensor, tensor_layout, conv_kind, problem_size, operand):
+    """
+    Build the cutlass TensorView matching a numpy tensor used as a conv2d operand.
+
+    The view pairs the reference returned by ``getTensorRef`` with the operand extent.
+
+    :param tensor: numpy array; its dtype selects the TensorView class
+    :param tensor_layout: layout class of the tensor
+    :param conv_kind: convolution kind forwarded to the extent helpers
+    :param problem_size: conv2d problem size forwarded to the extent helpers
+    :param operand: one of "a", "b", "c" or "d"
+    :raises ValueError: on an unknown operand or an unsupported dtype
+    """
+    ref = getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand)
+
+    # Pick the extent helper for the role this tensor plays; "c" and "d" share one.
+    if operand == "a":
+        extent_of = cutlass.conv.implicit_gemm_tensor_a_extent
+    elif operand == "b":
+        extent_of = cutlass.conv.implicit_gemm_tensor_b_extent
+    elif operand in ["c", "d"]:
+        extent_of = cutlass.conv.implicit_gemm_tensor_c_extent
+    else:
+        raise ValueError("unknown operand: " + operand)
+    extent = extent_of(conv_kind, problem_size)
+
+    element = tensor.dtype
+    if element == np.float64:
+        return cutlass.TensorViewF64NHWC(ref, extent)
+    if element == np.float32:
+        return cutlass.TensorViewF32NHWC(ref, extent)
+    if element == np.float16:
+        return cutlass.TensorViewF16NHWC(ref, extent)
+    if element == bfloat16:
+        return cutlass.TensorViewBF16NHWC(ref, extent)
+    if element == np.int32:
+        return cutlass.TensorViewS32NHWC(ref, extent)
+    if element == np.int8:
+        # int8 is the only element type with layout-specific view classes here
+        if tensor_layout == cutlass.TensorNC32HW32:
+            return cutlass.TensorViewS8NC32HW32(ref, extent)
+        if tensor_layout == cutlass.TensorC32RSK32:
+            return cutlass.TensorViewS8C32RSK32(ref, extent)
+        return cutlass.TensorViewS8NHWC(ref, extent)
+
+    raise ValueError("unsupported data type")
+
+
+
+# @typechecked
+class Conv2dLauncher:
+    """
+    Launcher that compiles a conv2d operation and runs it on a given problem size.
+
+    Besides the operation itself, a ``ReductionOperation`` is compiled for use with
+    parallel split-K. ``run`` can verify the result against the host reference
+    and/or time the kernel.
+    """
+    def __init__(self, operation: 'Conv2dOperation', seed: int=2080, interleaved=False,
+        verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
+        """
+        :param operation: conv2d operation to compile and launch
+        :param seed: value passed to ``np.random.seed`` before tensors are initialized;
+            also part of the result-cache file name
+        :param interleaved: stored as ``self.interleaved``
+        :param verification: when true, ``run`` compares the output with the host reference
+        :param profiling: when true, ``run`` times the kernel and returns the runtime
+        :param warmup_iterations: number of untimed launches issued before timing
+        :param iterations: number of timed launches
+        :param kwargs: only ``sleep`` is read - seconds to sleep before profiling (default 0)
+        """
+
+        self.enable_cached_results = True
+        self.interleaved = interleaved
+
+        # create the reduction kernel
+        self.reduction_operation = ReductionOperation(
+            shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
+            C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
+            element_compute=operation.element_epilogue,
+            count=operation.C.alignment
+        )
+
+        #: verify the output result
+        self.verification = verification
+        #: profile the kernel's runtime
+        self.profiling = profiling
+
+        self.timer = GpuTimer()
+
+        self.warmup_iterations = warmup_iterations
+        self.iterations = iterations
+
+        self.sleep_time = kwargs.get("sleep", 0)
+
+        #
+        # Compile the operator
+        #
+
+        pycutlass.compiler.add_module([operation, self.reduction_operation])
+
+        self.operation = operation
+
+        self.dtype_A = Conv2dLauncher.numpy_type(operation.A.element)
+        self.layout_A = operation.A.layout
+        self.dtype_B = Conv2dLauncher.numpy_type(operation.B.element)
+        self.layout_B = operation.B.layout
+        self.dtype_C = Conv2dLauncher.numpy_type(operation.C.element)
+        self.layout_C = operation.C.layout
+        # D shares the element type and layout of C
+        self.dtype_D = Conv2dLauncher.numpy_type(operation.C.element)
+        self.layout_D = operation.C.layout
+
+        accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
+        element_size = DataTypeSize[operation.A.element]
+
+        # ``scope`` bounds the magnitude of the random inputs (see ``uniform_init``);
+        # narrower element / accumulator types get a smaller range.
+        if element_size <= 8:
+            self.scope = 1
+        elif element_size == 16:
+            if accumulator_size <= 16:
+                self.scope = 2
+            else:
+                self.scope = 4
+        else:
+            self.scope = 7
+
+        # Seed
+        self.seed = seed
+
+        self.conv_kind = operation.conv_kind
+
+        #
+        # Get the host reference function
+        #
+
+        self.element_compute = operation.element_epilogue
+
+        self.host_conv2d = cutlass.test.conv.host.conv2d
+
+    @staticmethod
+    def numpy_type(type):
+        """
+        Map a cutlass element type to the numpy (or ``bfloat16``) type used on the host.
+
+        :raises ValueError: when the cutlass type has no host counterpart here
+        """
+        if type == cutlass.float64:
+            return np.float64
+        elif type == cutlass.float32:
+            return np.float32
+        elif type == cutlass.float16:
+            return np.float16
+        elif type == cutlass.bfloat16:
+            return bfloat16
+        elif type == cutlass.int32:
+            return np.int32
+        elif type == cutlass.int8:
+            return np.int8
+        else:
+            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
+
+    def print_problem_size(self, p, split_k_mode=1):
+        """Print a one-line description of problem size ``p`` and the split-K mode."""
+        print("nhwc_%dx%dx%dx%d_krsc_%dx%dx%dx%d_padding_%dx%d_stride_%dx%d_dilation_%dx%d_splitkslices_%d_splitkmode_%d"
+         % (p.N, p.H, p.W, p.C, p.K, p.R, p.S, p.C, p.pad_h,
+          p.pad_w, p.stride_h, p.stride_w, p.dilation_h, p.dilation_w, p.split_k_slices, split_k_mode))
+
+    def uniform_init(self, size, dtype):
+        """
+        Return a random tensor of the given ``size`` and ``dtype``.
+
+        Floating-point types are rounded up with ``np.ceil``, so they hold whole
+        numbers; other types are produced by casting the uniform samples to ``dtype``.
+        """
+        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
+            return np.ceil(
+                np.random.uniform(
+                    low=-self.scope - 0.5, high=self.scope - 0.5,
+                    size=size).astype(dtype)
+                )
+        else:
+            return np.random.uniform(
+                low=-self.scope - 1, high=self.scope + 1,
+                size=size).astype(dtype)
+
+    def eq_gemm_size(self, problem_size):
+        """
+        Return the ``GemmCoord`` (m, n, k) of the GEMM equivalent to the convolution.
+
+        fprop and dgrad have dedicated formulas; every other kind uses the last one.
+        """
+        n = problem_size.N
+        p = problem_size.P
+        q = problem_size.Q
+        k = problem_size.K
+        r = problem_size.R
+        s = problem_size.S
+        c = problem_size.C
+        h = problem_size.H
+        w = problem_size.W
+        if self.conv_kind == cutlass.conv.Operator.fprop:
+            return cutlass.gemm.GemmCoord(n * p * q, k, r * s * c)
+        elif self.conv_kind == cutlass.conv.Operator.dgrad:
+            return cutlass.gemm.GemmCoord(n * h * w, c, k * r * s)
+        else:
+            return cutlass.gemm.GemmCoord(k, r * s * c, n * p * q)
+
+    def bytes(self, problem_size, alpha, beta):
+        """
+        Return the byte count of the equivalent GEMM: A (m x k), B (n x k) and the
+        output (m x n), plus a second m x n term for reading C when ``beta`` is non-zero.
+
+        ``alpha`` is accepted but not used.
+        """
+        mnk = self.eq_gemm_size(problem_size)
+
+        # DataTypeSize is divided by 8 to convert element sizes to bytes
+        bytes_ = \
+            (DataTypeSize[self.operation.A.element] * mnk.m() // 8) * mnk.k() + \
+            (DataTypeSize[self.operation.B.element] * mnk.n() // 8) * mnk.k() + \
+            (DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()
+
+        if beta != 0:
+            bytes_ += (DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()
+
+        return bytes_
+
+    def flops(self, problem_size):
+        """Return the flop count (mainloop plus epilogue) of the equivalent GEMM."""
+        mnk = self.eq_gemm_size(problem_size)
+
+        flops_mainloop_ = mnk.m() * mnk.n() * mnk.k() * 2
+        flops_epilogue_ = mnk.m() * mnk.n() * 2
+
+        # Adjust mainloop flop for dgrad stride
+        if self.conv_kind == cutlass.conv.Operator.dgrad:
+            flops_mainloop_ = flops_mainloop_ // (problem_size.stride_h * problem_size.stride_w)
+
+        flops_total_ = flops_mainloop_ + flops_epilogue_
+
+        # TODO complex-value support
+        # switch (operation_desc.tile_description.math_instruction.math_operation) {
+        # case library::MathOperationID::kMultiplyAddComplex:
+        #     flops_total_ *=4;
+        #     break;
+
+        # default: break;
+        # }
+
+        return flops_total_
+
+    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
+        """
+        Compute (or load from the cache file) the reference result of the convolution.
+
+        :return: with cached results enabled, the ``D`` field of the cached test result
+            (set from ``TensorHash`` of the reference output when freshly computed);
+            otherwise the reference output tensor itself
+        """
+        # Convert the scalars to the epilogue compute type where a conversion is defined
+        if self.element_compute == cutlass.float16:
+            alpha = cutlass.float16(alpha)
+            beta = cutlass.float16(beta)
+        elif self.element_compute == cutlass.int32:
+            alpha = int(alpha)
+            beta = int(beta)
+
+        # if cached result is loaded
+        cached_result_loaded = False
+
+        if self.enable_cached_results:
+            # get problem key
+            cached_test_key = cutlass.test.conv.host.CreateCachedConv2dTestKey(
+                self.conv_kind, problem_size, alpha, beta,
+                getTensorView(tensor_A, self.layout_A, self.conv_kind, problem_size, "a"),
+                getTensorView(tensor_B, self.layout_B, self.conv_kind, problem_size, "b"),
+                getTensorView(tensor_C, self.layout_C, self.conv_kind, problem_size, "c"),
+            )
+
+            cached_test_result = cutlass.test.conv.host.CachedTestResult()
+
+            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (self.operation.arch, self.seed)
+
+            cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
+            # ``find`` returns a (found, result) pair
+            cached = cached_results.find(cached_test_key)
+            cached_result_loaded = cached[0]
+            if cached_result_loaded:
+                cached_test_result = cached[1]
+
+        if not cached_result_loaded:
+            # compute the conv2d on host
+            tensor_D_ref = np.ones_like(tensor_C)
+            tensor_ref_A = getTensorRef(tensor_A, self.layout_A, self.conv_kind, problem_size, "a")
+            tensor_ref_B = getTensorRef(tensor_B, self.layout_B, self.conv_kind, problem_size, "b")
+            tensor_ref_C = getTensorRef(tensor_C, self.layout_C, self.conv_kind, problem_size, "c")
+            tensor_ref_D_ref = getTensorRef(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
+
+            self.host_conv2d(
+                self.conv_kind, problem_size,
+                tensor_ref_A, tensor_ref_B, tensor_ref_C, tensor_ref_D_ref,
+                alpha, beta
+            )
+
+            tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
+
+            if self.enable_cached_results:
+                # store the hash of the fresh reference in the cache file
+                cached_test_result.D = cutlass.test.conv.host.TensorHash(tensor_view_D_ref)
+                cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
+                cached_results.append(cached_test_key, cached_test_result)
+                cached_results.write(conv2d_result_cache_name)
+            else:
+                return tensor_D_ref
+
+        return cached_test_result.D
+
+    def equal(self, tensor_D, tensor_D_ref, problem_size):
+        """
+        Compare the computed output with the value returned by ``host_reference``.
+
+        With cached results enabled ``tensor_D_ref`` is a hash and is compared with the
+        hash of ``tensor_D``; otherwise both tensors are compared element-wise by
+        ``cutlass.test.conv.host.equals``.
+        """
+        tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
+        if self.enable_cached_results:
+            tensor_D_hash = cutlass.test.conv.host.TensorHash(tensor_view_D)
+
+            return tensor_D_hash == tensor_D_ref
+        else:
+            tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
+            return cutlass.test.conv.host.equals(tensor_view_D, tensor_view_D_ref)
+
+    def run_cutlass_profiler(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial, alpha=1.0, beta=0.0):
+        """
+        Run the same problem through the ``cutlass_profiler`` executable.
+
+        The executable is looked up under ``$CUTLASS_PATH/build/tools/profiler``. Its
+        output is parsed for the runtime, byte count and flop count; the last two are
+        asserted to match ``self.bytes`` and ``self.flops``.
+
+        :return: the runtime reported by the profiler, as a float
+        :raises RuntimeError: when an expected field is missing from the profiler output
+        """
+        # imported locally so this method does not depend on ``os`` leaking in
+        # through the star imports at the top of the file
+        import os
+
+        if split_k_mode == cutlass.conv.SplitKMode.Serial:
+            split_k_mode_ = "serial"
+        else:
+            split_k_mode_ = "parallel"
+
+        cutlass_path = os.getenv('CUTLASS_PATH')
+        assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
+
+        values = {
+            "profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
+            "kernel_name": self.operation.procedural_name(),
+            "verification_providers": "device",
+            "provider": "cutlass",
+            'n': str(problem_size.N),
+            'h': str(problem_size.H),
+            'w': str(problem_size.W),
+            'c': str(problem_size.C),
+            'k': str(problem_size.K),
+            'r': str(problem_size.R),
+            's': str(problem_size.S),
+            'p': str(problem_size.P),
+            'q': str(problem_size.Q),
+            'pad_h': str(problem_size.pad_h),
+            'pad_w': str(problem_size.pad_w),
+            'stride_h': str(problem_size.stride_h),
+            'stride_w': str(problem_size.stride_w),
+            'dilation_h': str(problem_size.dilation_h),
+            'dilation_w': str(problem_size.dilation_w),
+            'split_k_slices': str(problem_size.split_k_slices),
+            'split_k_mode': split_k_mode_,
+            'alpha': str(alpha),
+            'beta': str(beta),
+            'warmup': str(self.warmup_iterations),
+            'profile': str(self.iterations)
+        }
+
+        # NOTE: every placeholder needs the leading '$'; a bare '{stride_h}' would be
+        # passed to the profiler verbatim instead of being substituted.
+        cmd_template = \
+            "${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
+            " --providers=${provider} --n=${n} --h=${h} --w=${w} --c=${c} --k=${k} --r=${r} --s=${s} --p=${p}" \
+            " --q=${q} --pad_h=${pad_h} --pad_w=${pad_w} --stride_h=${stride_h} --stride_w=${stride_w}" \
+            " --dilation_h=${dilation_h} --dilation_w=${dilation_w} --warmup-iterations=${warmup} --profiling-iterations=${profile}" \
+            " --split_k_slices=${split_k_slices} --alpha=${alpha} --beta=${beta} --split_k_mode=${split_k_mode}"
+
+        cmd = SubstituteTemplate(cmd_template, values)
+        result = subprocess.getoutput(cmd)
+
+        def parse_field(pattern, group):
+            # Fail with the profiler output instead of an AttributeError on None
+            match = re.search(pattern, result)
+            if match is None:
+                raise RuntimeError(
+                    "unable to find '%s' in cutlass_profiler output:\n%s" % (group, result))
+            return match.group(group)
+
+        runtime = float(parse_field(r"Runtime:\s+(?P<runtime>\d+(?:\.\d+)?)", 'runtime'))
+        reported_bytes = int(parse_field(r"Bytes:\s+(?P<bytes>\d+)", 'bytes'))
+        reported_flops = int(parse_field(r"FLOPs:\s+(?P<flops>\d+)", 'flops'))
+
+        # check if the problem size matches
+        assert reported_bytes == self.bytes(problem_size, alpha, beta)
+        assert reported_flops == self.flops(problem_size)
+
+        return runtime
+
+    def run(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial,
+        alpha=1.0, beta=0.0):
+        """
+        Launch the operation on ``problem_size`` and optionally verify / profile it.
+
+        With parallel split-K the reduction operation is launched after the convolution.
+
+        :return: the value of ``self.timer.duration(self.iterations)`` when profiling is
+            enabled, otherwise the verification result (``True`` when verification is off)
+        """
+
+        assert get_allocated_size() == 0, "%d byte of pool memory is not released in previous run" % get_allocated_size()
+
+        parallel_split_k = split_k_mode == cutlass.conv.SplitKMode.Parallel
+
+        #
+        # Initialize input and output tensors
+        #
+        tensor_A_size = cutlass.conv.implicit_gemm_tensor_a_size(self.conv_kind, problem_size)
+        tensor_B_size = cutlass.conv.implicit_gemm_tensor_b_size(self.conv_kind, problem_size)
+        tensor_C_size = cutlass.conv.implicit_gemm_tensor_c_size(self.conv_kind, problem_size)
+
+        np.random.seed(self.seed)
+
+        tensor_A = self.uniform_init(size=(tensor_A_size,), dtype=self.dtype_A)
+        tensor_B = self.uniform_init(size=(tensor_B_size,), dtype=self.dtype_B)
+        tensor_C = self.uniform_init(size=(tensor_C_size,), dtype=self.dtype_C)
+        tensor_D = np.zeros(shape=(tensor_C_size,), dtype=self.dtype_D)
+
+        #
+        # Launch kernel
+        #
+
+        arguments = Conv2dArguments(
+            operation=self.operation, problem_size=problem_size, A=tensor_A,
+            B=tensor_B, C=tensor_C, D=tensor_D,
+            output_op = LinearCombinationFunctorArguments(alpha, beta),
+            split_k_slices=problem_size.split_k_slices,
+            split_k_mode=split_k_mode
+        )
+
+        if parallel_split_k:
+            # the reduction reads the convolution's D pointer as its workspace
+            implicit_gemm_size = cutlass.conv.implicit_gemm_problem_size(self.operation.conv_kind, arguments.problem_size)
+            reduction_arguments = ReductionArguments(
+                self.reduction_operation,
+                problem_size=[implicit_gemm_size.m(), implicit_gemm_size.n()], partitions=problem_size.split_k_slices,
+                workspace=arguments.ptr_D,
+                destination=tensor_D,
+                source=tensor_C,
+                output_op = LinearCombinationFunctorArguments(alpha, beta)
+            )
+
+        self.operation.run(arguments)
+        if parallel_split_k:
+            self.reduction_operation.run(reduction_arguments)
+
+        passed = True
+        if self.verification:
+            if parallel_split_k:
+                reduction_arguments.sync()
+            else:
+                arguments.sync()
+
+            tensor_D_ref = self.host_reference(problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)
+
+            passed = self.equal(tensor_D, tensor_D_ref, problem_size)
+
+            # report the failing problem; the caller decides what to do with ``passed``
+            if not passed:
+                self.print_problem_size(problem_size, split_k_mode)
+
+        if self.profiling:
+            sleep(self.sleep_time)
+            for _ in range(self.warmup_iterations):
+                self.operation.run(arguments)
+                if parallel_split_k:
+                    self.reduction_operation.run(reduction_arguments)
+
+            # the timed loop must issue exactly ``self.iterations`` launches, the
+            # count that is handed to ``timer.duration`` below
+            self.timer.start()
+            for _ in range(self.iterations):
+                self.operation.run(arguments)
+                if parallel_split_k:
+                    self.reduction_operation.run(reduction_arguments)
+            self.timer.stop_and_wait()
+            runtime = self.timer.duration(self.iterations)
+
+        # free memory
+        del arguments
+        if parallel_split_k:
+            del reduction_arguments
+
+        assert get_allocated_size() == 0, "%d byte of pool memory is not released after current run" % get_allocated_size()
+        if self.profiling:
+            return runtime
+        return passed
+
+
+
+########################################################################################################
+# TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
+# TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
+# Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and a blacklist of sizes
+# (conv_blacklist_sizes)
+############################################################################################################
+
+def test_all_conv2d(operation: Conv2dOperation, conv_test_sizes = None, interleaved=False):  # TODO: conv_test_sizes and conv_blacklist_sizes
+    """
+    Run ``operation`` over the default conv2d problem sizes and a split-K sweep.
+
+    :param operation: conv2d operation under test
+    :param conv_test_sizes: optional extra problem sizes appended to the default ones
+    :param interleaved: when true, problems whose K or C is not a multiple of 32 are
+        skipped and the split-K sweep is not run
+    :return: ``True`` only if every executed problem passed
+    """
+    passed = True
+
+    #
+    # Testbed object
+    #
+
+    testbed = Conv2dLauncher(operation, interleaved=interleaved)
+
+    #
+    # Get conv problem sizes to run conv operator
+    #
+
+    conv_problems = cutlass.test.conv.TestbedConv2dProblemSizes(64)
+
+    # Vector of conv2d problem sizes to avoid duplicate runs
+    conv_tested_sizes = []
+
+    # TODO: include resnet 50 sizes, user specified sizes, and rigorous sizes
+
+    # Default problem sizes followed by the caller-provided ones
+    problem_sizes = list(conv_problems.conv2d_default_sizes)
+    if conv_test_sizes is not None:
+        problem_sizes += list(conv_test_sizes)
+
+    # Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slices=1, alpha=1.0, beta=0.0)
+    for conv_problem in problem_sizes:
+
+        # TODO: skip blacklist problem sizes
+        if conv_problem in conv_tested_sizes:
+            continue
+
+        # skip channel dimension % 32 != 0 for interleaved case
+        if interleaved:
+            if conv_problem.K % 32 != 0 or conv_problem.C % 32 != 0:
+                continue
+
+        #
+        # Procedurally disable certain cases
+        #
+
+        # CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
+        if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Unity:
+            if not ((conv_problem.stride_h == 1) and (conv_problem.stride_w == 1)):
+                continue
+
+        if not interleaved:
+            # Fixed channels algorithm requires channel count to match access size
+            if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.fixed_channels:
+                if conv_problem.C != operation.A.alignment:
+                    continue
+
+            # Few channels algorithm requires channel count to be a multiple of the access size
+            if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.few_channels:
+                if conv_problem.C % operation.A.alignment:
+                    continue
+
+            # CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
+            # Although strided dgrad works for all stride combinations, we are only going
+            # to run strided dgrad for non-unity strides
+
+            if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
+                if (conv_problem.stride_h == 1) and (conv_problem.stride_w == 1):
+                    continue
+
+        #
+        # Test
+        #
+
+        # push back tested problem size to avoid re-running duplicates
+        conv_tested_sizes.append(conv_problem)
+
+        # Keep running the remaining sizes, but remember any failure so that a
+        # later passing problem cannot mask it.
+        if not testbed.run(conv_problem):
+            passed = False
+
+        # TODO: If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
+
+    if interleaved:
+        return passed
+    #
+    # filter the cases for split K
+    #
+
+    # Small-channels convolution can't run here.
+    if operation.iterator_algorithm in [cutlass.conv.IteratorAlgorithm.fixed_channels, cutlass.conv.IteratorAlgorithm.few_channels]:
+        return passed
+
+    # CUTLASS DGRAD's *stride* specialization does not support split-k mode
+    if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
+        conv_problem = cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 56, 56, 8),
+            cutlass.Tensor4DCoord(8, 1, 1, 8),
+            cutlass.Tensor4DCoord(0, 0, 0, 0),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        )
+        if not testbed.run(conv_problem):
+            passed = False
+
+        return passed
+
+    # Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero beta for
+    # a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
+    # which are absolutely necessary to catch functional bugs. The below code does provide option to sweep
+    # alpha and beta for local testing, but only runs one value for alpha and beta.
+
+    conv2d_split_k_test_size = cutlass.conv.Conv2dProblemSize(
+        cutlass.Tensor4DCoord(1, 17, 11, 288),
+        cutlass.Tensor4DCoord(160, 3, 3, 288),
+        cutlass.Tensor4DCoord(1, 1, 1, 1),
+        cutlass.MatrixCoord(1, 1),
+        cutlass.MatrixCoord(1, 1),
+        cutlass.conv.Mode.cross_correlation,
+        1, 1
+    )
+
+    split_k_modes = [cutlass.conv.SplitKMode.Parallel, cutlass.conv.SplitKMode.Serial]
+
+    split_k_slices = [1, 2, 3, 4, 201]
+    problem_alpha = [2.0,]
+    problem_beta = [2.0,]
+
+    for split_k_mode in split_k_modes:
+        for split_k_slice in split_k_slices:
+            for alpha in problem_alpha:
+                for beta in problem_beta:
+                    run_passed = testbed.run(
+                        conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
+                        split_k_mode,
+                        alpha, beta)
+                    if not run_passed:
+                        passed = False
+
+    return passed
--- a/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py
@ -0,0 +1,235 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import pycutlass
+from pycutlass.test.gemm_testbed import getTensorRef, getTensorView, transpose
+from pycutlass import *
+import numpy as np
+import cutlass
+from bfloat16 import bfloat16
+
+
+class TestbedGrouped:
+    """Functional testbed for a grouped GEMM operation.
+
+    Construction compiles the operation; :meth:`run` launches it on a batch of
+    problems and checks every output against the host reference GEMM.
+    """
+
+    def __init__(self, operation: GemmOperationGrouped, seed: int = 2080) -> None:
+        """Compile ``operation`` and derive numpy dtypes and the value range.
+
+        :param operation: grouped GEMM operation under test
+        :param seed: value handed to ``np.random.seed`` at the start of each run
+        """
+        pycutlass.compiler.add_module([operation])
+
+        self.seed = seed
+
+        self.operation = operation
+
+        element_size = DataTypeSize[operation.A.element]
+
+        self.dtype_A = self.numpy_type(operation.A.element)
+        self.dtype_B = self.numpy_type(operation.B.element)
+        self.dtype_C = self.numpy_type(operation.C.element)
+        # D is created with the element type of C.
+        self.dtype_D = self.numpy_type(operation.C.element)
+
+        # Narrower A elements get a narrower range of initial values
+        # (presumably so that results stay exactly comparable -- confirm).
+        if element_size == 1:
+            self.scope_max = 1
+            self.scope_min = 0
+        elif element_size <= 8:
+            self.scope_max = 1
+            self.scope_min = -1
+        elif element_size == 16:
+            self.scope_max = 4
+            self.scope_min = -4
+        else:
+            self.scope_max = 8
+            self.scope_min = -8
+
+        #: compute type
+        self.compute_type = operation.element_epilogue
+
+        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
+
+    @staticmethod
+    def numpy_type(type):
+        """Map a cutlass element type to the matching numpy (or bfloat16) type.
+
+        :raises ValueError: if the element type has no mapping here
+        """
+        if type == cutlass.float64:
+            return np.float64
+        elif type == cutlass.float32:
+            return np.float32
+        elif type == cutlass.float16:
+            return np.float16
+        elif type == cutlass.bfloat16:
+            return bfloat16
+        elif type == cutlass.int32:
+            return np.int32
+        elif type == cutlass.int8:
+            return np.int8
+        else:
+            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
+
+    def uniform_init(self, size, dtype):
+        """Return a random array of ``size`` elements of ``dtype``.
+
+        Floating-point types are drawn uniformly, cast, and rounded up with
+        ``np.ceil`` so every value is integral; other types are drawn from a
+        range one wider on each side and cast to ``dtype``.
+        """
+        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
+            return np.ceil(
+                np.random.uniform(
+                    low=self.scope_min - 0.5, high=self.scope_max - 0.5,
+                    size=size).astype(dtype)
+            )
+        else:
+            return np.random.uniform(
+                low=self.scope_min - 1, high=self.scope_max + 1,
+                size=size).astype(dtype)
+
+    def print_problem_size(self, p):
+        """Print the m, n, k extents of problem ``p``."""
+        problem_size = "problem: %d, %d, %d\n" % (p.m(), p.n(), p.k())
+        print(problem_size)
+
+    def _make_problem_size(self, index):
+        """Return the GemmCoord for the ``index``-th problem of a run.
+
+        Problem 0 has a fixed size; later problems draw m, n and k (in that
+        order) from ``np.random.randint``, using a step of 16 for int8
+        operands and 8 otherwise.
+        """
+        if self.dtype_A == np.int8:
+            first, step, offset = (48, 16, 32), 16, 48
+        else:
+            first, step, offset = (48, 16, 8), 8, 24
+
+        if index == 0:
+            return cutlass.gemm.GemmCoord(*first)
+
+        # Drawn one at a time so the random stream is consumed as m, n, k.
+        m = step * np.random.randint(0, 64) + offset
+        n = step * np.random.randint(0, 64) + offset
+        k = step * np.random.randint(0, 64) + offset
+        return cutlass.gemm.GemmCoord(m, n, k)
+
+    def run(self, problem_count: int, alpha: float = 1.0, beta: float = 0.0) -> bool:
+        """Launch the grouped GEMM on ``problem_count`` problems and verify it.
+
+        :param problem_count: number of GEMM problems in the group
+        :param alpha: epilogue scalar passed to LinearCombinationFunctorArguments
+        :param beta: epilogue scalar passed to LinearCombinationFunctorArguments
+        :return: True only if every problem matches the host reference
+        :raises AssertionError: if pool memory is still allocated before or
+            after the run
+        """
+        assert get_allocated_size(
+        ) == 0, "%d byte of pool memory is not released in previous run" % get_allocated_size()
+
+        # initialize
+        np.random.seed(self.seed)
+
+        # generate the problem sizes and the host tensors of every problem
+        problem_sizes = []
+        tensor_As = []
+        tensor_Bs = []
+        tensor_Cs = []
+        tensor_Ds = []
+        tensor_D_refs = []
+
+        for i in range(problem_count):
+            problem_size = self._make_problem_size(i)
+            m, n, k = problem_size.m(), problem_size.n(), problem_size.k()
+
+            tensor_As.append(self.uniform_init(size=(m * k,), dtype=self.dtype_A))
+            tensor_Bs.append(self.uniform_init(size=(n * k,), dtype=self.dtype_B))
+            tensor_Cs.append(self.uniform_init(size=(m * n,), dtype=self.dtype_C))
+
+            # Device output starts at zero and the reference at one, so an
+            # output that was never written cannot compare equal by accident.
+            tensor_Ds.append(np.zeros(shape=(m * n,), dtype=self.dtype_D))
+            tensor_D_refs.append(np.ones(shape=(m * n,), dtype=self.dtype_D))
+
+            problem_sizes.append(problem_size)
+
+        arguments = GemmGroupedArguments(
+            operation=self.operation, problem_sizes=problem_sizes,
+            A=tensor_As, B=tensor_Bs, C=tensor_Cs, D=tensor_Ds,
+            output_op=LinearCombinationFunctorArguments(alpha, beta)
+        )
+
+        self.operation.run(arguments)
+
+        arguments.sync()
+
+        #
+        # Reference check - TODO: support caching results
+        #
+        alpha = self.compute_type(alpha).value()
+        beta = self.compute_type(beta).value()
+        init_acc = self.accumulator_type(0).value()
+
+        if self.operation.switched:
+            # A switched operation is referenced with A/B layouts exchanged
+            # and every layout transposed.
+            layout_A = transpose(self.operation.B.layout)
+            layout_B = transpose(self.operation.A.layout)
+            layout_C = transpose(self.operation.C.layout)
+        else:
+            layout_A = self.operation.A.layout
+            layout_B = self.operation.B.layout
+            layout_C = self.operation.C.layout
+
+        # Every problem is checked; a single mismatch fails the whole run.
+        all_passed = True
+
+        for idx, problem_size in enumerate(problem_sizes):
+            tensor_ref_A = getTensorRef(
+                tensor_As[idx], problem_size, "a", layout_A)
+            tensor_ref_B = getTensorRef(
+                tensor_Bs[idx], problem_size, "b", layout_B)
+            tensor_ref_C = getTensorRef(
+                tensor_Cs[idx], problem_size, "c", layout_C)
+            tensor_ref_D_ref = getTensorRef(
+                tensor_D_refs[idx], problem_size, "d", layout_C)
+
+            # The views used for comparison always use the operation's own
+            # C layout, switched or not.
+            tensor_view_D_ref = getTensorView(
+                tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)
+
+            cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
+                                        tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
+
+            tensor_view_D = getTensorView(
+                tensor_Ds[idx], problem_size, "d", self.operation.C.layout)
+
+            if not cutlass.test.gemm.host.equals(tensor_view_D, tensor_view_D_ref):
+                self.print_problem_size(problem_size)
+                all_passed = False
+
+        del arguments
+
+        assert get_allocated_size(
+        ) == 0, "%d byte of pool memory is not released after current run" % get_allocated_size()
+
+        return all_passed
--- a/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py
@ -0,0 +1,557 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+from time import sleep
+import pycutlass
+from pycutlass import *
+import cutlass
+from cuda import cudart
+from cuda import cuda
+from bfloat16 import bfloat16
+from .profiler import GpuTimer
+import subprocess
+
+
+def transpose(layout):
+    """Return the layout with row-major and column-major exchanged.
+
+    Covers the plain and the 32-interleaved layouts; any other layout
+    yields ``None``.
+    """
+    swapped_layouts = (
+        (cutlass.RowMajor, cutlass.ColumnMajor),
+        (cutlass.ColumnMajor, cutlass.RowMajor),
+        (cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32),
+        (cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32),
+    )
+    for candidate, counterpart in swapped_layouts:
+        if layout == candidate:
+            return counterpart
+    return None
+
+
+def getTensorRef(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: cutlass.layout):
+    """Build the cutlass TensorRef binding for a host numpy buffer.
+
+    :param tensor: numpy array holding the operand data
+    :param problem_size: GEMM extents; the operand picks the mk, kn or mn pair
+    :param operand: one of "a", "b", "c" or "d"
+    :param layout: cutlass layout type (row/column major, plain or interleaved32)
+    :return: instance of ``cutlass.TensorRef<dtype><layout>`` built from the
+        buffer address and a packed layout
+    :raises ValueError: for an unknown operand, layout or element type
+    """
+    # Only the raw address is handed to cutlass, so presumably the array
+    # must outlive the returned reference -- confirm against the bindings.
+    ptr = tensor.__array_interface__['data'][0]
+    if operand == "a":
+        tensor_coord = problem_size.mk()
+    elif operand == "b":
+        tensor_coord = problem_size.kn()
+    elif operand in ["c", "d"]:
+        tensor_coord = problem_size.mn()
+    else:
+        raise ValueError("unknown operand: " + operand)
+
+    layout_tags = (
+        (cutlass.RowMajor, "RowMajor"),
+        (cutlass.ColumnMajor, "ColumnMajor"),
+        (cutlass.ColumnMajorInterleaved32, "ColumnMajorInterleaved32"),
+        (cutlass.RowMajorInterleaved32, "RowMajorInterleaved32"),
+    )
+    for layout_type, layout_tag in layout_tags:
+        if layout == layout_type:
+            layout = layout_type.packed(tensor_coord)
+            break
+    else:
+        raise ValueError("unsupported layout")
+
+    dtype_tags = (
+        (np.float32, "F32"),
+        (np.float64, "F64"),
+        (np.float16, "F16"),
+        (bfloat16, "BF16"),
+        (np.int8, "S8"),
+        (np.int32, "S32"),
+    )
+    for dtype, dtype_tag in dtype_tags:
+        if tensor.dtype == dtype:
+            ref_name = "TensorRef" + dtype_tag + layout_tag
+            break
+    else:
+        # Format the numpy dtype directly: ShortDataTypeNames is keyed by
+        # cutlass element types, so indexing it here raised KeyError and hid
+        # the intended ValueError.
+        raise ValueError("unsupported datatype %s" % tensor.dtype)
+
+    return getattr(cutlass, ref_name)(ptr, layout)
+
+
+def getTensorView(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: str):
+    """Build the cutlass TensorView binding for a host numpy buffer.
+
+    A TensorRef is created first with getTensorRef and then paired with the
+    operand's extent (mk, kn or mn of ``problem_size``).
+
+    :raises ValueError: for an unknown operand, layout or element type
+    """
+    tensor_ref = getTensorRef(tensor, problem_size, operand, layout)
+
+    # Extent of the selected operand.
+    if operand == "a":
+        extent = problem_size.mk()
+    elif operand == "b":
+        extent = problem_size.kn()
+    elif operand in ["c", "d"]:
+        extent = problem_size.mn()
+    else:
+        raise ValueError("unknonw operand: " + operand)
+
+    # Layout part of the binding name.
+    known_layouts = (
+        (cutlass.RowMajor, "RowMajor"),
+        (cutlass.ColumnMajor, "ColumnMajor"),
+        (cutlass.ColumnMajorInterleaved32, "ColumnMajorInterleaved32"),
+        (cutlass.RowMajorInterleaved32, "RowMajorInterleaved32"),
+    )
+    layout_tag = None
+    for known_layout, tag in known_layouts:
+        if layout == known_layout:
+            layout_tag = tag
+            break
+    if layout_tag is None:
+        raise ValueError("unsupported layout")
+
+    # Element-type part of the binding name.
+    known_dtypes = (
+        (np.float32, "TensorViewF32"),
+        (np.float64, "TensorViewF64"),
+        (np.float16, "TensorViewF16"),
+        (bfloat16, "TensorViewBF16"),
+        (np.int32, "TensorViewS32"),
+        (np.int8, "TensorViewS8"),
+    )
+    view_prefix = None
+    for known_dtype, prefix in known_dtypes:
+        if tensor.dtype == known_dtype:
+            view_prefix = prefix
+            break
+    if view_prefix is None:
+        raise ValueError("unsupported datatype")
+
+    view_type = getattr(cutlass, view_prefix + layout_tag)
+    return view_type(tensor_ref, extent)
+
+
+class GemmUniversalLauncher:
+    """Launch, verify and optionally time a universal GEMM operation.
+
+    Construction compiles the operation together with the reduction kernel
+    used by the parallel split-K mode. :meth:`run` executes one problem and
+    returns either the verification result or the measured runtime.
+    """
+
+    def __init__(self, operation: 'GemmOperationUniversal', seed: int = 2080, interleaved=False,
+                 verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
+        """Compile ``operation`` and record the launcher settings.
+
+        :param operation: GEMM operation under test
+        :param seed: value handed to ``np.random.seed`` at the start of each run
+        :param interleaved: stored flag saying whether the layout is interleaved
+        :param verification: compare the result with the host reference
+        :param profiling: time the kernel and return the runtime from run()
+        :param warmup_iterations: untimed launches before timing
+        :param iterations: timed launches
+        :param kwargs: optional ``sleep`` -- seconds to sleep before profiling
+        """
+        # create the reduction kernel
+        self.reduction_operation: ReductionOperation = ReductionOperation(
+            shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
+            C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
+            element_compute=operation.element_epilogue,
+            count=operation.C.alignment
+        )
+
+        self.math_operation = operation.tile_description.math_instruction.math_operation
+
+        #: verify the output result
+        self.verification = verification
+        #: profile the kernel's runtime
+        self.profiling = profiling
+
+        self.timer = GpuTimer()
+
+        self.warmup_iterations = warmup_iterations
+        self.iterations = iterations
+
+        self.sleep_time = kwargs.get("sleep", 0)
+
+        #
+        # Compile the operator
+        #
+
+        pycutlass.compiler.add_module([operation, self.reduction_operation])
+
+        self.operation = operation
+
+        self.dtype_A = GemmUniversalLauncher.numpy_type(operation.A.element)
+        self.dtype_B = GemmUniversalLauncher.numpy_type(operation.B.element)
+        self.dtype_C = GemmUniversalLauncher.numpy_type(operation.C.element)
+        # D is created with the element type of C.
+        self.dtype_D = GemmUniversalLauncher.numpy_type(operation.C.element)
+
+        element_size = DataTypeSize[operation.A.element]
+
+        # Narrower A elements get a narrower range of initial values
+        # (presumably so that results stay exactly comparable -- confirm).
+        if element_size == 1:
+            self.scope_max = 1
+            self.scope_min = 0
+        elif element_size <= 8:
+            self.scope_max = 1
+            self.scope_min = -1
+        elif element_size == 16:
+            self.scope_max = 4
+            self.scope_min = -4
+        else:
+            self.scope_max = 8
+            self.scope_min = -8
+
+        #: seed
+        self.seed: int = seed
+
+        #: whether the layout is interleaved
+        self.interleaved = interleaved
+
+        #: compute type
+        self.compute_type = operation.element_epilogue
+        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
+
+    def print_problem_size(self, p, mode, batch_count):
+        """Print the extents, batch count and mode of a problem."""
+        if mode == cutlass.gemm.Mode.Gemm:
+            mode = "Gemm"
+        elif mode == cutlass.gemm.Mode.GemmSplitKParallel:
+            mode = "GemmSplitKParalel"
+        problem_size = "problem: %d, %d, %d\n batch_count: %d\n mode: %s" % (
+            p.m(), p.n(), p.k(), batch_count, mode)
+        print(problem_size)
+
+    @staticmethod
+    def numpy_type(type):
+        """Map a cutlass element type to the matching numpy (or bfloat16) type.
+
+        :raises ValueError: if the element type has no mapping here
+        """
+        if type == cutlass.float64:
+            return np.float64
+        elif type == cutlass.float32:
+            return np.float32
+        elif type == cutlass.float16:
+            return np.float16
+        elif type == cutlass.bfloat16:
+            return bfloat16
+        elif type == cutlass.int32:
+            return np.int32
+        elif type == cutlass.int8:
+            return np.int8
+        else:
+            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
+
+    def uniform_init(self, size, dtype):
+        """Return a random array of ``size`` elements of ``dtype``.
+
+        Floating-point types are drawn uniformly, cast, and rounded up with
+        ``np.ceil`` so every value is integral; other types are drawn from a
+        range one wider on each side and cast to ``dtype``.
+        """
+        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
+            return np.ceil(
+                np.random.uniform(
+                    low=self.scope_min - 0.5, high=self.scope_max - 0.5,
+                    size=size).astype(dtype)
+            )
+        else:
+            return np.random.uniform(
+                low=self.scope_min - 1, high=self.scope_max + 1,
+                size=size).astype(dtype)
+
+    def reorder_tensor_B(self, tensor_B, problem_size):
+        """Return a copy of ``tensor_B`` reordered by ``reorder_column``."""
+        reordered_tensor_B = np.empty_like(tensor_B)
+        tensor_ref_B = getTensorRef(
+            tensor_B, problem_size, "b", self.operation.B.layout)
+        reordered_tensor_ref_B = getTensorRef(
+            reordered_tensor_B, problem_size, "b", self.operation.B.layout)
+        cutlass.gemm.host.reorder_column(
+            tensor_ref_B, reordered_tensor_ref_B, problem_size)
+        return reordered_tensor_B
+
+    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
+        """Compute the reference output on the host.
+
+        :return: new array shaped like ``tensor_C`` filled by the host GEMM
+            (the saturating variant for multiply_add_saturate)
+        """
+        # TODO
+        tensor_D_ref = np.ones_like(tensor_C)
+        alpha = self.numpy_type(self.compute_type)(alpha)
+        beta = self.numpy_type(self.compute_type)(beta)
+        init_acc = 0
+
+        alpha = self.compute_type(alpha).value()
+        beta = self.compute_type(beta).value()
+        init_acc = self.accumulator_type(init_acc).value()
+
+        if self.operation.switched:
+            # A switched operation is referenced with A/B layouts exchanged
+            # and every layout transposed.
+            layout_A = transpose(self.operation.B.layout)
+            layout_B = transpose(self.operation.A.layout)
+            layout_C = transpose(self.operation.C.layout)
+        else:
+            layout_A = self.operation.A.layout
+            layout_B = self.operation.B.layout
+            layout_C = self.operation.C.layout
+
+        tensor_ref_A = getTensorRef(tensor_A, problem_size, "a", layout_A)
+        tensor_ref_B = getTensorRef(tensor_B, problem_size, "b", layout_B)
+        tensor_ref_C = getTensorRef(tensor_C, problem_size, "c", layout_C)
+        tensor_ref_D_ref = getTensorRef(
+            tensor_D_ref, problem_size, "d", layout_C)
+
+        if self.math_operation in [MathOperation.multiply_add_saturate]:
+            cutlass.test.gemm.host.gemm_saturate(
+                problem_size, alpha, tensor_ref_A, tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
+        else:
+            cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
+                                        tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
+
+        return tensor_D_ref
+
+    def equal(self, tensor_D, tensor_D_ref, problem_size):
+        """Return whether the two outputs compare equal as C-layout views."""
+        tensor_view_D = getTensorView(
+            tensor_D, problem_size, "d", self.operation.C.layout)
+        tensor_view_D_ref = getTensorView(
+            tensor_D_ref, problem_size, "d", self.operation.C.layout)
+
+        return cutlass.test.gemm.host.equals(tensor_view_D, tensor_view_D_ref)
+
+    def bytes(self, problem_size, batch_count=1, alpha=1.0, beta=0.0):
+        """Return the bytes moved by one problem.
+
+        Counts A and B once and C once; C is counted a second time when
+        ``beta`` is non-zero. The total is scaled by ``batch_count``.
+        ``alpha`` is accepted but does not affect the result.
+        """
+        m = problem_size.m()
+        n = problem_size.n()
+        k = problem_size.k()
+
+        # DataTypeSize is in bits, hence the division by 8.
+        size_c = (DataTypeSize[self.operation.C.element] * m // 8) * n
+
+        num_bytes = \
+            (DataTypeSize[self.operation.A.element] * m // 8) * k + \
+            (DataTypeSize[self.operation.B.element] * n // 8) * k + \
+            size_c
+
+        if beta != 0:
+            num_bytes += size_c
+
+        num_bytes *= batch_count
+
+        return num_bytes
+
+    def flops(self, problem_size, batch_count=1):
+        """Return ``(m*n*k + m*n) * 2 * batch_count`` for the problem."""
+        m = problem_size.m()
+        n = problem_size.n()
+        k = problem_size.k()
+
+        flops_ = (m * n * k + m * n) * 2 * batch_count
+
+        # TODO: complex
+        return flops_
+
+    def run_cutlass_profiler(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):
+        """Time the kernel with the stand-alone ``cutlass_profiler`` binary.
+
+        The binary is looked up under ``$CUTLASS_PATH/build/tools/profiler``.
+        ``mode`` and ``batch_count`` are not forwarded to the profiler.
+
+        :return: the runtime parsed from the profiler output
+        :raises RuntimeError: if the output lacks the Runtime, Bytes or FLOPs field
+        :raises AssertionError: if CUTLASS_PATH is unset, or the reported
+            bytes/FLOPs disagree with :meth:`bytes` / :meth:`flops`
+        """
+        # Imported locally so the method does not rely on these modules
+        # leaking in through ``from pycutlass import *``.
+        import os
+        import re
+
+        cutlass_path = os.getenv('CUTLASS_PATH')
+        assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
+
+        values = {
+            "profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
+            "kernel_name": self.operation.procedural_name(),
+            "verification_providers": "device",
+            "provider": "cutlass",
+            "m": str(problem_size.m()),
+            "n": str(problem_size.n()),
+            "k": str(problem_size.k()),
+            'split_k_slices': str(batch_count),
+            'alpha': str(alpha),
+            'beta': str(beta),
+            'warmup': str(self.warmup_iterations),
+            'profile': str(self.iterations)
+        }
+
+        # alpha and beta are forwarded so that the byte count reported by the
+        # profiler refers to the same epilogue as the check below.
+        cmd_template = \
+            "${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
+            " --providers=${provider} --m=${m} --n=${n} --k=${k}" \
+            " --alpha=${alpha} --beta=${beta}"
+
+        cmd = SubstituteTemplate(cmd_template, values)
+        result = subprocess.getoutput(cmd)
+
+        def report_field(pattern, name):
+            # Fail with the profiler output instead of an AttributeError on None.
+            found = re.search(pattern, result)
+            if found is None:
+                raise RuntimeError(
+                    "cutlass_profiler output has no '%s' field:\n%s" % (name, result))
+            return found.group(name)
+
+        runtime = float(report_field(r"Runtime:\s+(?P<runtime>\d+.\d+)", 'runtime'))
+        reported_bytes = int(report_field(r"Bytes:\s+(?P<bytes>\d+)", 'bytes'))
+        reported_flops = int(report_field(r"FLOPs:\s+(?P<flops>\d+)", 'flops'))
+
+        # check if the problem size matches; alpha/beta go by keyword because
+        # the second positional parameter of bytes() is batch_count
+        assert reported_bytes == self.bytes(problem_size, alpha=alpha, beta=beta)
+        assert reported_flops == self.flops(problem_size)
+
+        return runtime
+
+    def run(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):
+        """Run one GEMM problem, then verify and/or time it.
+
+        :param mode: cutlass.gemm.Mode value; GemmSplitKParallel additionally
+            runs the reduction kernel
+        :param problem_size: GEMM extents
+        :param batch_count: passed to GemmArguments as ``split_k_slices``
+        :param alpha: epilogue scalar
+        :param beta: epilogue scalar
+        :return: the measured runtime when profiling is enabled, otherwise
+            the verification result (True when verification is disabled)
+        :raises AssertionError: if pool memory is still allocated before or
+            after the run
+        """
+        assert get_allocated_size(
+        ) == 0, "%d byte of pool memory is not released in previous run" % get_allocated_size()
+
+        np.random.seed(self.seed)
+
+        split_k_parallel = mode == cutlass.gemm.Mode.GemmSplitKParallel
+
+        tensor_A = self.uniform_init(
+            size=(problem_size.m() * problem_size.k(),), dtype=self.dtype_A)
+        tensor_B = self.uniform_init(
+            size=(problem_size.n() * problem_size.k(),), dtype=self.dtype_B)
+        tensor_C = self.uniform_init(
+            size=(problem_size.m() * problem_size.n(),), dtype=self.dtype_C)
+        tensor_D = np.zeros(
+            shape=(problem_size.m() * problem_size.n(),), dtype=self.dtype_D)
+
+        #
+        # Launch kernel
+        #
+
+        arguments = GemmArguments(
+            operation=self.operation, problem_size=problem_size,
+            A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
+            output_op=LinearCombinationFunctorArguments(alpha, beta),
+            gemm_mode=mode, split_k_slices=batch_count
+        )
+
+        if split_k_parallel:
+            # The reduction reads the GEMM's D pointer as its workspace and
+            # writes the final result to tensor_D.
+            reduction_arguments = ReductionArguments(
+                self.reduction_operation, problem_size=[
+                    problem_size.m(), problem_size.n()],
+                partitions=batch_count,
+                workspace=arguments.ptr_D,
+                destination=tensor_D,
+                source=tensor_C,
+                output_op=LinearCombinationFunctorArguments(alpha, beta)
+            )
+
+        self.operation.run(arguments)
+
+        if split_k_parallel:
+            self.reduction_operation.run(reduction_arguments)
+
+        passed = True
+
+        if self.verification:
+            if split_k_parallel:
+                reduction_arguments.sync()
+            else:
+                arguments.sync()
+            tensor_D_ref = self.host_reference(
+                problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)
+            passed = self.equal(tensor_D, tensor_D_ref, problem_size)
+
+            # Plain test instead of assert so the report survives ``python -O``.
+            if not passed:
+                self.print_problem_size(problem_size, mode, batch_count)
+
+        if self.profiling:
+            sleep(self.sleep_time)
+            for _ in range(self.warmup_iterations):
+                self.operation.run(arguments)
+                if split_k_parallel:
+                    self.reduction_operation.run(reduction_arguments)
+
+            self.timer.start()
+            for _ in range(self.iterations):
+                self.operation.run(arguments)
+                if split_k_parallel:
+                    self.reduction_operation.run(reduction_arguments)
+            self.timer.stop_and_wait()
+
+            runtime = self.timer.duration(self.iterations)
+
+        # free memory and clear buffers
+        del arguments
+        if split_k_parallel:
+            del reduction_arguments
+
+        assert get_allocated_size(
+        ) == 0, "%d byte of pool memory is not released after current run" % get_allocated_size()
+
+        if self.profiling:
+            return runtime
+        return passed
+
+
+def test_all_gemm(operation: 'GemmOperationUniversal', testcase="universal"):
+    """Sweep a GEMM operation over a set of problem sizes and modes.
+
+    :param operation: GEMM operation under test
+    :param testcase: "interleaved", "multistage", or anything else for the
+        universal sweep (which also exercises parallel split-K)
+    :return: False as soon as one problem fails verification, True otherwise
+    :raises ValueError: for the interleaved testcase with a non-interleaved
+        A layout
+    :raises RuntimeError: if the device synchronisation reports an error
+    """
+    from itertools import product
+
+    passed = True
+
+    minimum_operand_element_size = min(
+        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element])
+    opcode_class = operation.tile_description.math_instruction.opcode_class
+    is_simt = opcode_class == cutlass.OpClass.Simt
+
+    # SIMT kernels take any extent; otherwise extents are multiples of a
+    # 128-bit access of the narrower operand.
+    alignment = 1 if is_simt else 128 // minimum_operand_element_size
+
+    # int8_t gemm alignment constraints
+    a_is_int8 = operation.A.element == cutlass.int8
+    b_is_int8 = operation.B.element == cutlass.int8
+
+    if is_simt and a_is_int8 and operation.A.layout == cutlass.ColumnMajor:
+        alignment_m = 4
+    else:
+        alignment_m = alignment
+
+    # N is an extent of operand B, so its constraint depends on B's layout
+    # (the original tested A's layout here).
+    if is_simt and b_is_int8 and operation.B.layout == cutlass.RowMajor:
+        alignment_n = 4
+    else:
+        alignment_n = alignment
+
+    if is_simt and a_is_int8 and b_is_int8 \
+            and (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor):
+        alignment_k = 4
+    else:
+        alignment_k = alignment
+
+    threadblock_k = operation.tile_description.threadblock_shape[2]
+    stages = operation.tile_description.stages
+
+    if testcase == "interleaved":
+        if operation.A.layout not in [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]:
+            raise ValueError("unknown layout")
+        interleavedk = 32
+
+        modes = [cutlass.gemm.Mode.Gemm, ]
+        problem_size_m = [interleavedk, 512+interleavedk]
+        problem_size_n = [interleavedk, 512+interleavedk]
+        problem_size_k = [interleavedk, threadblock_k * stages + interleavedk]
+        problem_alpha = [1.0]
+        problem_beta = [0.0]
+        batch_counts = [1, ]
+    elif testcase == "multistage":
+        modes = [cutlass.gemm.Mode.Gemm, ]
+        problem_size_m = [16, 528]
+        problem_size_n = [16, 528]
+        problem_size_k = [threadblock_k, threadblock_k * stages +
+                          operation.tile_description.math_instruction.instruction_shape[2]]
+        problem_alpha = [1.0]
+        problem_beta = [0.0]
+        batch_counts = [1, ]
+    else:  # universal
+        modes = [cutlass.gemm.Mode.Gemm, cutlass.gemm.Mode.GemmSplitKParallel]
+        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
+        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
+        problem_size_k = [
+            alignment_k,
+            threadblock_k * stages - alignment_k,
+            threadblock_k * stages * 3 - alignment_k]
+        batch_counts = [1, 2, 3, 5, 7]
+        problem_alpha = [1.0]
+        problem_beta = [2.0]
+
+    testbed = GemmUniversalLauncher(
+        operation, interleaved=(testcase == "interleaved"))
+
+    # product() walks the combinations in the same order as nested loops.
+    sweep = product(modes, problem_size_m, problem_size_n, problem_size_k,
+                    batch_counts, problem_alpha, problem_beta)
+
+    for mode, m, n, k, batch_count, alpha, beta in sweep:
+        # skip very small K problems
+        if testcase == "universal" and k // batch_count < 2 * threadblock_k:
+            continue
+
+        problem_size = cutlass.gemm.GemmCoord(m, n, k)
+
+        passed = testbed.run(
+            mode, problem_size, batch_count, alpha, beta)
+
+        err, = cudart.cudaDeviceSynchronize()
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError(
+                "CUDA Error %s" % str(err))
+
+        if not passed:
+            return False
+
+    return passed
--- a/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py
@ -0,0 +1,70 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+from cuda import cuda
+from cuda import cudart
+
+
+class GpuTimer:
+    """Time a region of GPU work with a pair of CUDA events.
+
+    Typical use: ``start()``, enqueue the work, ``stop_and_wait()``, then
+    ``duration()``. Every driver/runtime status code is checked and turned
+    into a ``RuntimeError`` on failure.
+    """
+
+    def __init__(self) -> None:
+        # events[0] marks the beginning of the timed region, events[1] the end.
+        self.events = [self._create_event(), self._create_event()]
+
+    @staticmethod
+    def _create_event():
+        """Create one default-flag CUDA event, raising if the driver call fails."""
+        err, event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError("CUDA Error %s" % str(err))
+        return event
+
+    def start(self, stream=cuda.CUstream(0)):
+        """Record the start event on ``stream``."""
+        err, = cuda.cuEventRecord(self.events[0], stream)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError("CUDA Error %s" % str(err))
+
+    def stop(self, stream=cuda.CUstream(0)):
+        """Record the stop event on ``stream`` without waiting for it."""
+        err, = cuda.cuEventRecord(self.events[1], stream)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError("CUDA Error %s" % str(err))
+
+    def stop_and_wait(self, stream=cuda.CUstream(0)):
+        """Record the stop event, then block until the recorded work is done.
+
+        A truthy ``stream`` is synchronized on its own; otherwise the whole
+        device is synchronized.
+        """
+        self.stop(stream)
+        # NOTE(review): this relies on the truthiness of the CUstream object;
+        # confirm that a null stream handle actually evaluates as falsy.
+        if stream:
+            err, = cuda.cuStreamSynchronize(stream)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError("CUDA Error %s" % str(err))
+        else:
+            # cudaDeviceSynchronize is a runtime-API call, so its status is
+            # compared against the runtime enum rather than the driver enum.
+            err, = cudart.cudaDeviceSynchronize()
+            if err != cudart.cudaError_t.cudaSuccess:
+                raise RuntimeError("CUDA Error %s" % str(err))
+
+    def duration(self, iterations=1):
+        """Return the elapsed time between the two events divided by ``iterations``.
+
+        The value is in whatever unit ``cuEventElapsedTime`` reports
+        (milliseconds according to the CUDA driver API documentation).
+        """
+        err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError("CUDA Error %s" % str(err))
+        return duration / float(iterations)
--- a/tools/library/scripts/pycutlass/src/pycutlass/type.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/type.py
@ -0,0 +1,39 @@
+################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+from typing import Union
+from typeguard import typechecked
+
+
+# Type aliases kept as plain strings rather than real ``Union`` objects: the
+# names they mention (GemmOperationUniversal, cuda, np, torch, cp, ...) are not
+# imported in this module, so the expressions could not be evaluated here.
+# NOTE(review): presumably used as forward-reference annotations together
+# with typeguard -- confirm against the call sites.
+
+# Any GEMM operation description (universal or grouped).
+GemmOperation = 'Union[GemmOperationUniversal, GemmOperationGrouped]'
+
+# Any supported tensor container: raw device pointer, NumPy, PyTorch or CuPy.
+Tensor = 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]'
--- a/tools/library/scripts/pycutlass/src/pycutlass/utils/init.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/utils/init.py
@ -0,0 +1 @@
+from pycutlass.utils.reference_model import * 
--- a/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py
+++ b/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py
@ -0,0 +1,234 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import numpy as np
+import cutlass
+from pycutlass.library import TensorDescription
+from typing import Union
+try:
+    import torch
+    torch_available = True
+except ImportError:
+    torch_available = False
+
+class ReferenceModule:
+    """Reference GEMM that evaluates ``alpha * (A @ B) + beta * C``.
+
+    Only the ``layout`` attribute of each tensor description is kept; it
+    decides how the flat input buffers are interpreted as 2-D matrices.
+    """
+
+    def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription) -> None:
+        self.layout_A = A.layout
+        self.layout_B = B.layout
+        self.layout_C = C.layout
+
+    @staticmethod
+    def _as_row_major(tensor, layout, rows, cols, reshape, transpose):
+        """Interpret ``tensor`` as a logical (rows, cols) matrix.
+
+        Any layout other than ``cutlass.RowMajor`` is treated as column-major:
+        the buffer is viewed as (cols, rows) and then transposed.
+        """
+        if layout == cutlass.RowMajor:
+            return reshape(tensor, (rows, cols))
+        return transpose(reshape(tensor, (cols, rows)))
+
+    def run(self, A: np.ndarray, B: np.ndarray, C: np.ndarray, problem_size: cutlass.gemm.GemmCoord, alpha: float=1.0, beta: float=0.0):
+        """
+        Compute the reference result with NumPy or PyTorch, chosen by the type of ``A``.
+
+        Args:
+            A: dense operator with shape (M, K) in row-major and (K, M) in column-major
+            B: dense operator with shape (K, N) in row-major and (N, K) in column-major
+            C: dense operator with shape (M, N) in row-major and (N, M) in column-major
+            problem_size: GEMM extents, read through ``m()``, ``n()`` and ``k()``
+            alpha: scale applied to the product ``A @ B``
+            beta: scale applied to ``C``
+
+        Returns:
+            The flattened result, laid out according to the layout of ``C``.
+
+        Raises:
+            TypeError: if ``A`` is neither a NumPy array nor (when PyTorch is
+                installed) a ``torch.Tensor``.
+        """
+        M, N, K = problem_size.m(), problem_size.n(), problem_size.k()
+
+        # Select the backend primitives once so the math below is written a single time.
+        if isinstance(A, np.ndarray):
+            # The shape is passed positionally: the ``newshape`` keyword of
+            # np.reshape is deprecated in recent NumPy releases.
+            reshape = lambda t, shape: np.reshape(t, shape)
+            transpose = lambda t: np.transpose(t, axes=(1, 0))
+            matmul = np.matmul
+            flatten = lambda t: t.ravel()
+        elif torch_available and isinstance(A, torch.Tensor):
+            # ``torch`` is only referenced when its import succeeded.
+            reshape = lambda t, shape: t.view(shape)
+            transpose = lambda t: torch.permute(t, (1, 0))
+            matmul = torch.matmul
+            flatten = torch.flatten
+        else:
+            raise TypeError(
+                "ReferenceModule.run expects np.ndarray or torch.Tensor operands, got %s"
+                % type(A).__name__)
+
+        A_row = self._as_row_major(A, self.layout_A, M, K, reshape, transpose)
+        B_row = self._as_row_major(B, self.layout_B, K, N, reshape, transpose)
+        C_row = self._as_row_major(C, self.layout_C, M, N, reshape, transpose)
+
+        out_row = matmul(A_row, B_row) * alpha + C_row * beta
+
+        # Only an explicit column-major C transposes the result back before flattening.
+        if self.layout_C == cutlass.ColumnMajor:
+            out = transpose(out_row)
+        else:
+            out = out_row
+
+        return flatten(out)
+
+
+
+#####################################################################################################
+# Conv2d
+#####################################################################################################
+
+if torch_available:
+    class Conv2dReferenceModule:
+        """PyTorch-backed reference for 2-D convolution (fprop, dgrad, wgrad).
+
+        Only NHWC tensor layouts are supported. Operands are permuted to the
+        NCHW order PyTorch expects, and the result is permuted back to NHWC.
+        """
+
+        def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription, kind: cutlass.conv.Operator = cutlass.conv.Operator.fprop) -> None:
+            self.layout_A = A.layout
+            self.layout_B = B.layout
+            self.layout_C = C.layout
+            self.kind = kind
+
+        def run(self,
+            A: Union[np.ndarray, torch.Tensor],
+            B: Union[np.ndarray, torch.Tensor],
+            C: Union[np.ndarray, torch.Tensor], problem_size, alpha=1.0, beta=0.0) -> Union[np.ndarray, torch.Tensor]:
+            """
+            Compute ``alpha * conv(A, B) + beta * C`` with PyTorch.
+
+            The meaning of the operands depends on ``self.kind``:
+                fprop: A = activation, B = filter,     C = output
+                dgrad: A = output,     B = filter,     C = activation
+                wgrad: A = output,     B = activation, C = filter
+
+            NumPy operands are copied to the "cuda" device for the computation
+            and the result is returned as a flat NumPy array; torch operands
+            are used in place and a flat torch tensor is returned.
+
+            Raises:
+                TypeError: if ``A`` is neither ``np.ndarray`` nor ``torch.Tensor``.
+                NotImplementedError: for a non-NHWC layout or an unknown ``kind``.
+            """
+            n = problem_size.N
+            h = problem_size.H
+            w = problem_size.W
+            c = problem_size.C
+
+            k = problem_size.K
+            r = problem_size.R
+            s = problem_size.S
+
+            p = problem_size.P
+            q = problem_size.Q
+
+            stride = (problem_size.stride_h, problem_size.stride_w)
+            padding = (problem_size.pad_h, problem_size.pad_w)
+            dilation = (problem_size.dilation_h, problem_size.dilation_w)
+            groups = problem_size.groups
+
+            # The container type of A decides how all three operands are handled.
+            if isinstance(A, np.ndarray):
+                from_numpy = True
+            elif isinstance(A, torch.Tensor):
+                from_numpy = False
+            else:
+                raise TypeError(
+                    "Conv2dReferenceModule.run expects np.ndarray or torch.Tensor operands, got %s"
+                    % type(A).__name__)
+
+            # Fail with a clear message instead of an unbound-variable error further down.
+            for operand, layout in (("A", self.layout_A), ("B", self.layout_B), ("C", self.layout_C)):
+                if layout == cutlass.TensorNHWC:
+                    continue
+                raise NotImplementedError(
+                    "Conv2dReferenceModule only supports TensorNHWC; operand %s uses %s"
+                    % (operand, str(layout)))
+
+            activation_shape = (n, h, w, c)
+            filter_shape = (k, r, s, c)
+            output_shape = (n, p, q, k)
+
+            # Pick the NHWC shape of each operand from the convolution kind.
+            if self.kind == cutlass.conv.Operator.fprop:
+                shape_A, shape_B, shape_C = activation_shape, filter_shape, output_shape
+            elif self.kind == cutlass.conv.Operator.dgrad:
+                shape_A, shape_B, shape_C = output_shape, filter_shape, activation_shape
+            elif self.kind == cutlass.conv.Operator.wgrad:
+                shape_A, shape_B, shape_C = output_shape, activation_shape, filter_shape
+            else:
+                raise NotImplementedError(
+                    "Unsupported convolution kind: %s" % str(self.kind))
+
+            def to_nchw(tensor, nhwc_shape):
+                # the pytorch activation layout is NCHW
+                #             weight layout is Cout Cin Kh Kw (also NCHW)
+                if from_numpy:
+                    # Positional shape: the ``newshape`` keyword of np.reshape
+                    # is deprecated in recent NumPy releases.
+                    nhwc = torch.from_numpy(np.reshape(tensor, nhwc_shape)).to("cuda")
+                else:
+                    nhwc = tensor.view(nhwc_shape)
+                return torch.permute(nhwc, (0, 3, 1, 2))
+
+            A_torch_nchw = to_nchw(A, shape_A)
+            B_torch_nchw = to_nchw(B, shape_B)
+            C_torch_nchw = to_nchw(C, shape_C)
+
+            # NOTE(review): dilation and groups are only forwarded on the fprop
+            # path; dgrad/wgrad use the PyTorch defaults -- confirm this is intended.
+            if self.kind == cutlass.conv.Operator.fprop:
+                D_torch_nchw = alpha * torch.nn.functional.conv2d(
+                    A_torch_nchw, B_torch_nchw, stride=stride,
+                    padding=padding, dilation=dilation, groups=groups) + beta * C_torch_nchw
+            elif self.kind == cutlass.conv.Operator.dgrad:
+                D_torch_nchw = alpha * torch.nn.grad.conv2d_input(
+                    (n, c, h, w), B_torch_nchw, A_torch_nchw, padding=padding, stride=stride
+                ).to(torch.float32) + beta * C_torch_nchw
+            else:  # wgrad; any other kind was rejected above
+                D_torch_nchw = alpha * torch.nn.grad.conv2d_weight(
+                    B_torch_nchw, (k, c, r, s), A_torch_nchw, padding=padding, stride=stride
+                ).to(torch.float32) + beta * C_torch_nchw
+
+            # Back to NHWC, matching the (validated) layout of C.
+            D_torch_nhwc = torch.permute(D_torch_nchw, (0, 2, 3, 1))
+            if from_numpy:
+                return D_torch_nhwc.detach().cpu().numpy().flatten()
+            return D_torch_nhwc.flatten()
--- a/tools/library/scripts/pycutlass/test/conv/init.py
+++ b/tools/library/scripts/pycutlass/test/conv/init.py
--- a/tools/library/scripts/pycutlass/test/conv/cached_results_SM80.txt
+++ b/tools/library/scripts/pycutlass/test/conv/cached_results_SM80.txt
@ -0,0 +1,274 @@
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
+conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
+conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
+conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
+conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
+conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
+conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
+conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
+conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
+conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
+conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
+conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
+conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
+conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
+conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
+conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
+conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
+conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
+conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
+conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
+conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
+conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
+conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
+conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
+conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
+conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
+conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
+conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
+conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
+conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
+conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
+conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
+conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
+conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
+conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
+conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
+conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
+conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
+conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
+conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
+conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
+conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
+conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
+conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
+conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
+conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
+conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
+conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
+conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
+conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
+conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
+conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
+conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
+conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
+conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
+conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
+conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
+conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
+conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
+conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
+conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
+conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
+conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
+conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
+conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
+conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
+conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
+conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
+conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
+conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
+conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
+conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
+conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
+conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
+conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
+conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
+conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
@ -0,0 +1,187 @@
+# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+from pycutlass.conv2d_operation import *
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+
+class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,162 @@
+# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=4, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=4, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
@ -0,0 +1,89 @@
+# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+import pycutlass
+from pycutlass.conv2d_operation import *
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[1, 1, 1],
+            element_a=cutlass.float32, element_b=cutlass.float32, 
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32, 
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 8], stages=4, 
+            warp_count=[4, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[1, 1, 1],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 8], stages=4, 
+            warp_count=[2, 4, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,86 @@
+# test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Unity,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,154 @@
+# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass.test import *
+import unittest
+
+def conv2d_few_channel_problemsizes(channels):
+    problem_sizes = [
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 8, 8, channels),
+            cutlass.Tensor4DCoord(16, 3, 3, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 16, 16, channels),
+            cutlass.Tensor4DCoord(16, 3, 3, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 16, 16, channels),
+            cutlass.Tensor4DCoord(16, 7, 7, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(32, 7, 7, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(64, 7, 7, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(64, 5, 5, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(64, 5, 5, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+    ]
+
+    return problem_sizes
+
+class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=2)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=2)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))
+    
+    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=1)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=2, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,175 @@
+# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass.test import *
+import unittest
+
+def conv2d_fixed_channel_problemsizes(channels):
+    problem_sizes = [
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 8, 8, channels),
+            cutlass.Tensor4DCoord(16, 3, 3, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(32, 7, 7, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(64, 7, 7, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(64, 5, 5, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+        cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 224, 224, channels),
+            cutlass.Tensor4DCoord(64, 5, 5, channels),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(2, 2),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        ),
+    ]
+
+    return problem_sizes
+
+class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(8)))
+    
+    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(4)))
+    
+    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=2)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=2)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(2)))
+
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
@ -0,0 +1,291 @@
+# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=2)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=2)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 14),
+                cutlass.Tensor4DCoord(8, 3, 3, 14),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 23, 56, 98),
+                cutlass.Tensor4DCoord(128, 3, 3, 98),
+                cutlass.Tensor4DCoord(4, 0, 5, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+    
+    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=2)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=2)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 14),
+                cutlass.Tensor4DCoord(8, 3, 3, 14),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 23, 56, 98),
+                cutlass.Tensor4DCoord(128, 3, 3, 98),
+                cutlass.Tensor4DCoord(4, 0, 5, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+    
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 28),
+                cutlass.Tensor4DCoord(8, 3, 3, 28),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 23, 56, 100),
+                cutlass.Tensor4DCoord(128, 3, 3, 100),
+                cutlass.Tensor4DCoord(4, 0, 5, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,48 @@
+# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
@ -0,0 +1,87 @@
+# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+import pycutlass
+from pycutlass.conv2d_operation import *
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[1, 1, 1],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 8], stages=4, 
+            warp_count=[4, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle2
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
+        math_inst = MathInstruction(
+            instruction_shape=[1, 1, 1],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 8], stages=4, 
+            warp_count=[2, 4, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,98 @@
+# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
+        # Describes an fprop Conv2d with the analytic iterator algorithm and the
+        # TensorOp opcode class (instruction shape 16x8x8) for arch 80, then asserts
+        # that test_all_conv2d() returns a truthy result using its default problems.
+        # NOTE(review): the test name says tf32, but element_a/element_b are
+        # cutlass.float32 — presumably pycutlass treats float32 TensorOp operands
+        # as TF32; confirm.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        # NOTE(review): alignment=8 on a float32 output tensor — confirm this is
+        # the intended access width.
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
+        # Describes an fprop Conv2d with the optimized iterator algorithm and the
+        # TensorOp opcode class for arch 80, with A and B alignment reduced to 2,
+        # then asserts that test_all_conv2d() returns a truthy result for one
+        # explicitly supplied problem size.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=2)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=2)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        # One explicit problem replaces test_all_conv2d's defaults. Presumably the
+        # arguments are (input NHWC, filter KRSC, padding, stride, dilation, mode,
+        # split_k_slices, groups) — confirm against the Conv2dProblemSize binding.
+        # The last coordinate of both tensors is 12, which is divisible by the
+        # A/B alignment of 2 but not by 8.
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            )
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+    
+# Script entry point: pycutlass.get_memory_pool is called with 2**26 for both
+# arguments before unittest discovers and runs the tests in this module.
+# Presumably these are the initial and maximum device memory pool sizes in
+# bytes (64 MiB each) — confirm against pycutlass.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,235 @@
+# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
+        # Describes a dgrad Conv2d with the analytic iterator algorithm, float16
+        # operands, float32 accumulator/output and the TensorOp opcode class for
+        # arch 80, then asserts that test_all_conv2d() returns a truthy result
+        # using its default problems.
+        # The "128x128_32x3" part of the name matches threadblock_shape
+        # [128, 128, 32] and stages=3 below.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        # This test uses the strided-dgrad swizzle rather than the plain
+        # IdentitySwizzle1.
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
+        # Describes a dgrad Conv2d with the analytic iterator algorithm on a
+        # larger tile (threadblock_shape [128, 256, 64], warp_count [2, 4, 1],
+        # stages=3) with float16 operands and float32 accumulator/output, then
+        # asserts that test_all_conv2d() returns a truthy result using its
+        # default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 256, 64], stages=3, 
+            warp_count=[2, 4, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
+        # Describes a dgrad Conv2d with the analytic iterator algorithm and A/B
+        # alignment reduced to 4 (float16 operands, float32 accumulator/output),
+        # then asserts that test_all_conv2d() returns a truthy result for one
+        # explicitly supplied problem size.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
+        )
+        
+        # One explicit problem replaces test_all_conv2d's defaults. Presumably the
+        # arguments are (input NHWC, filter KRSC, padding, stride, dilation, mode,
+        # split_k_slices, groups) — confirm against the Conv2dProblemSize binding.
+        # The last coordinate of both tensors is 12, which is divisible by the
+        # alignment of 4 but not by 8.
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+    
+    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
+        # Describes a dgrad Conv2d with the optimized iterator algorithm, float16
+        # operands, float32 accumulator/output and the TensorOp opcode class for
+        # arch 80, then asserts that test_all_conv2d() returns a truthy result
+        # using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        # Same description as the analytic 128x128_32x3 test except for the
+        # iterator algorithm.
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
+        # Describes a dgrad Conv2d with the optimized iterator algorithm and A/B
+        # alignment reduced to 4 (float16 operands, float32 accumulator/output),
+        # then asserts that test_all_conv2d() returns a truthy result for two
+        # explicitly supplied problem sizes.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.StridedDgradIdentitySwizzle1
+        )
+        
+        # Two explicit problems that differ only in the first tensor's middle
+        # extents (56x56 vs 55x55). Presumably the arguments are (input NHWC,
+        # filter KRSC, padding, stride, dilation, mode, split_k_slices, groups), so
+        # MatrixCoord(2, 2) would be a 2x2 stride on a 1x1 filter — confirm
+        # against the Conv2dProblemSize binding.
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 56, 56, 12),
+                cutlass.Tensor4DCoord(8, 1, 1, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(2, 2),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 55, 55, 12),
+                cutlass.Tensor4DCoord(8, 1, 1, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(2, 2),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+
+
+
+# Script entry point: pycutlass.get_memory_pool is called with 2**26 for both
+# arguments before unittest discovers and runs the tests in this module.
+# Presumably these are the initial and maximum device memory pool sizes in
+# bytes (64 MiB each) — confirm against pycutlass.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
@ -0,0 +1,86 @@
+# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
+    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
+        # Describes a wgrad Conv2d with the analytic iterator algorithm where
+        # operands, accumulator, output and epilogue element are all float16,
+        # using the TensorOp opcode class for arch 80; then asserts that
+        # test_all_conv2d() returns a truthy result using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
+        # Describes a wgrad Conv2d with the optimized iterator algorithm where
+        # operands, accumulator, output and epilogue element are all float16,
+        # using the TensorOp opcode class for arch 80; then asserts that
+        # test_all_conv2d() returns a truthy result using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float16,
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 64], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        # Same description as the analytic f16 wgrad test except for the
+        # iterator algorithm.
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float16, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+# Script entry point: pycutlass.get_memory_pool is called with 2**26 for both
+# arguments before unittest discovers and runs the tests in this module.
+# Presumably these are the initial and maximum device memory pool sizes in
+# bytes (64 MiB each) — confirm against pycutlass.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,224 @@
+# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
+    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
+        # Describes a wgrad Conv2d with the analytic iterator algorithm, float16
+        # operands, float32 accumulator/output and the TensorOp opcode class
+        # (instruction shape 16x8x8) for arch 80; then asserts that
+        # test_all_conv2d() returns a truthy result using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
+        # Describes a wgrad Conv2d with the optimized iterator algorithm, float16
+        # operands, float32 accumulator/output and the TensorOp opcode class
+        # (instruction shape 16x8x8) for arch 80; then asserts that
+        # test_all_conv2d() returns a truthy result using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
+        # Describes a wgrad Conv2d with the optimized iterator algorithm on a
+        # 64x256x32 threadblock tile (warp_count [1, 4, 1], instruction shape
+        # 16x8x16) with float16 operands and float32 accumulator/output; then
+        # asserts that test_all_conv2d() returns a truthy result using its
+        # default problems.
+        # NOTE(review): the "32x4" in the test name suggests 4 stages, but
+        # stages=3 is set below — confirm which one is intended.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=8)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=8)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[64, 256, 32], stages=3, 
+            warp_count=[1, 4, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+    
+    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
+        # Describes a wgrad Conv2d with the analytic iterator algorithm and A/B
+        # alignment reduced to 4 (float16 operands, float32 accumulator/output),
+        # then asserts that test_all_conv2d() returns a truthy result for one
+        # explicitly supplied problem size.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        # One explicit problem replaces test_all_conv2d's defaults. Presumably the
+        # arguments are (input NHWC, filter KRSC, padding, stride, dilation, mode,
+        # split_k_slices, groups) — confirm against the Conv2dProblemSize binding.
+        # The last coordinate of both tensors is 12, which is divisible by the
+        # alignment of 4 but not by 8.
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+    
+    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
+        # Describes a wgrad Conv2d with the optimized iterator algorithm and A/B
+        # alignment reduced to 4 (float16 operands, float32 accumulator/output),
+        # then asserts that test_all_conv2d() returns a truthy result for one
+        # explicitly supplied problem size.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 16], stages=3, 
+            warp_count=[2, 2, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        # One explicit problem replaces test_all_conv2d's defaults. Presumably the
+        # arguments are (input NHWC, filter KRSC, padding, stride, dilation, mode,
+        # split_k_slices, groups) — confirm against the Conv2dProblemSize binding.
+        # The last coordinate of both tensors is 12, which is divisible by the
+        # alignment of 4 but not by 8.
+        problem_sizes = [
+            cutlass.conv.Conv2dProblemSize(
+                cutlass.Tensor4DCoord(1, 4, 4, 12),
+                cutlass.Tensor4DCoord(8, 3, 3, 12),
+                cutlass.Tensor4DCoord(0, 0, 0, 0),
+                cutlass.MatrixCoord(3, 3),
+                cutlass.MatrixCoord(1, 1),
+                cutlass.conv.Mode.cross_correlation,
+                1, 1
+            ),
+        ]
+        
+        self.assertTrue(test_all_conv2d(operation, problem_sizes))
+
+# Script entry point: pycutlass.get_memory_pool is called with 2**26 for both
+# arguments before unittest discovers and runs the tests in this module.
+# Presumably these are the initial and maximum device memory pool sizes in
+# bytes (64 MiB each) — confirm against pycutlass.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
@ -0,0 +1,87 @@
+# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+import pycutlass
+from pycutlass.conv2d_operation import *
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
+    def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
+        # Describes a wgrad Conv2d with the analytic iterator algorithm, float32
+        # operands/accumulator and the Simt opcode class (instruction shape
+        # 1x1x1) for arch 80; then asserts that test_all_conv2d() returns a
+        # truthy result using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[1, 1, 1],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 8], stages=4, 
+            warp_count=[2, 4, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
+        # Describes a wgrad Conv2d with the optimized iterator algorithm, float32
+        # operands/accumulator and the Simt opcode class (instruction shape
+        # 1x1x1) for arch 80; then asserts that test_all_conv2d() returns a
+        # truthy result using its default problems.
+        math_inst = MathInstruction(
+            instruction_shape=[1, 1, 1],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+
+        A = TensorDescription(
+            element=math_inst.element_a, 
+            layout=cutlass.TensorNHWC,
+            alignment=4)
+        B = TensorDescription(
+            element=math_inst.element_b, 
+            layout=cutlass.TensorNHWC, 
+            alignment=4)
+        C = TensorDescription(
+            element=cutlass.float32,
+            layout=cutlass.TensorNHWC, 
+            alignment=1)
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 8], stages=4, 
+            warp_count=[2, 4, 1],
+            math_instruction=math_inst,
+            min_compute=80, max_compute=80
+        )
+
+        # Same description as the analytic simt wgrad test except for the
+        # iterator algorithm.
+        operation = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+        
+        self.assertTrue(test_all_conv2d(operation))
+
+# Script entry point: pycutlass.get_memory_pool is called with 2**26 for both
+# arguments before unittest discovers and runs the tests in this module.
+# Presumably these are the initial and maximum device memory pool sizes in
+# bytes (64 MiB each) — confirm against pycutlass.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -0,0 +1,98 @@
+# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
+    """Conv2d weight-gradient tests for arch 80 on the TensorOp path.
+
+    Both tests use float32 NHWC tensors, a [16, 8, 8] instruction shape, the
+    optimized iterator algorithm and float32 accumulation.
+
+    NOTE(review): the class name says "TF32nhwc" for all three tensors, while
+    the file name describes the output as f32nhwc — confirm which is intended.
+    """
+
+    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
+        # Aligned variant: A/B alignment 4, C alignment 8, 128x128x16 tile,
+        # run against the testbed's default problem sizes.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32,
+            element_b=cutlass.float32,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        operand_a = TensorDescription(element=mma.element_a, layout=cutlass.TensorNHWC, alignment=4)
+        operand_b = TensorDescription(element=mma.element_b, layout=cutlass.TensorNHWC, alignment=4)
+        operand_c = TensorDescription(element=cutlass.float32, layout=cutlass.TensorNHWC, alignment=8)
+
+        tile = TileDescription(
+            threadblock_shape=[128, 128, 16],
+            stages=3,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        wgrad_op = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad,
+            iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_conv2d(wgrad_op)
+        self.assertTrue(passed)
+
+    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
+        # Alignment-1 variant: A/B alignment 1, C alignment 4, 128x128x32 tile,
+        # run against one explicitly supplied problem size.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32,
+            element_b=cutlass.float32,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        operand_a = TensorDescription(element=mma.element_a, layout=cutlass.TensorNHWC, alignment=1)
+        operand_b = TensorDescription(element=mma.element_b, layout=cutlass.TensorNHWC, alignment=1)
+        operand_c = TensorDescription(element=cutlass.float32, layout=cutlass.TensorNHWC, alignment=4)
+
+        tile = TileDescription(
+            threadblock_shape=[128, 128, 32],
+            stages=3,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        wgrad_op = Conv2dOperation(
+            conv_kind=cutlass.conv.Operator.wgrad,
+            iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            stride_support=StrideSupport.Strided,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        # NOTE(review): the positional arguments are presumably input size,
+        # filter size, padding, stride, dilation, mode, split-k slices and
+        # groups — confirm against cutlass.conv.Conv2dProblemSize.
+        tiny_problem = cutlass.conv.Conv2dProblemSize(
+            cutlass.Tensor4DCoord(1, 8, 8, 1),
+            cutlass.Tensor4DCoord(1, 3, 3, 1),
+            cutlass.Tensor4DCoord(1, 1, 1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.MatrixCoord(1, 1),
+            cutlass.conv.Mode.cross_correlation,
+            1, 1
+        )
+
+        passed = test_all_conv2d(wgrad_op, [tiny_problem])
+        self.assertTrue(passed)
+    
+# Script entry point: configure the pycutlass memory pool, then let unittest
+# discover and run the TestCase classes defined in this module.
+# NOTE(review): the two 2**26 (67,108,864) arguments are presumably the initial
+# and maximum pool sizes in bytes — confirm against pycutlass.get_memory_pool.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/conv/run_all_tests.py
+++ b/tools/library/scripts/pycutlass/test/conv/run_all_tests.py
@ -0,0 +1,10 @@
+import pycutlass
+import unittest
+from pycutlass.memory_manager import *
+
+# Script entry point: run every conv2d_*.py test module found under the current
+# working directory and report the outcome through the process exit status.
+if __name__ == '__main__':
+    # Imported here so the fix is self-contained within the script body.
+    import sys
+
+    # NOTE(review): the two 2**32 arguments are presumably the initial and
+    # maximum pool sizes in bytes — confirm against pycutlass.get_memory_pool.
+    pycutlass.get_memory_pool(2**32, 2**32)
+    loader = unittest.TestLoader()
+    # Discovery is relative to the working directory, so the script is expected
+    # to be launched from the directory that holds the conv2d_*.py files.
+    tests = loader.discover('./', 'conv2d_*.py')
+    testRunner = unittest.runner.TextTestRunner()
+    result = testRunner.run(tests)
+    # Previously the result was discarded and the script always exited 0, so a
+    # calling shell or CI job could not tell that tests had failed.
+    sys.exit(0 if result.wasSuccessful() else 1)
--- a/tools/library/scripts/pycutlass/test/frontend/run_test.sh
+++ b/tools/library/scripts/pycutlass/test/frontend/run_test.sh
@ -0,0 +1 @@
+# Run the frontend tests with the CUPY_CACHE_DIR environment variable set to
+# the current directory for this invocation only.
+CUPY_CACHE_DIR=./ python test_frontend.py
--- a/tools/library/scripts/pycutlass/test/frontend/test_frontend.py
+++ b/tools/library/scripts/pycutlass/test/frontend/test_frontend.py
@ -0,0 +1,136 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+## Test cases for the PyTorch and CuPy tensor frontends
+import pycutlass
+import unittest
+from pycutlass import *
+import torch
+import cupy as cp
+
+
+class Test_Frontend(unittest.TestCase):
+    """Run one SIMT float32 GEMM with PyTorch and with CuPy tensors as operands.
+
+    Each test passes framework tensors straight to GemmArguments and compares
+    the result with a reference computed by the framework itself.
+    """
+
+    def setUp(self) -> None:
+        #
+        # define the cutlass operator
+        #
+        math_inst = MathInstruction(
+            [1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
+            cutlass.OpClass.Simt, MathOperation.multiply_add
+        )
+
+        tile_description = TileDescription(
+            [128, 128, 8], 4, [2, 4, 1],
+            math_inst, 80, 80
+        )
+
+        # A, B and C are all float32, row-major, alignment 1.
+        A = TensorDescription(
+            cutlass.float32, cutlass.RowMajor, 1
+        )
+
+        B = TensorDescription(
+            cutlass.float32, cutlass.RowMajor, 1
+        )
+
+        C = TensorDescription(
+            cutlass.float32, cutlass.RowMajor, 1
+        )
+
+        self.operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination, 
+            swizzling_functor=cutlass.IdentitySwizzle1
+        )
+
+        # setUp runs before every test method, so the operation is registered
+        # with the compiler once per test.
+        pycutlass.compiler.add_module([self.operation,])
+
+
+    def test_torch_frontend(self):
+        problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
+
+        # ceil() of a uniform sample yields small integer-valued floats, so the
+        # float32 products and sums are exact and torch.equal can be used below.
+        tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
+        tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
+        tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
+        tensor_D = torch.empty_like(tensor_C)
+        
+
+        alpha = 1.0
+        beta = 0.0
+
+        # Fixed keyword spelling: this was `split_k_splices`, which GemmArguments
+        # would only have absorbed as an unknown extra keyword. The value 1 is
+        # unchanged. NOTE(review): confirm `split_k_slices` against GemmArguments.
+        arguments = GemmArguments(
+            operation=self.operation, problem_size=problem_size,
+            A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
+            output_op=LinearCombinationFunctorArguments(alpha, beta),
+            gemm_mode=cutlass.gemm.Mode.Gemm, split_k_slices=1
+        )
+
+        self.operation.run(arguments)
+
+        arguments.sync()
+
+        tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
+
+        self.assertTrue(torch.equal(tensor_D, tensor_D_ref))
+    
+    def test_cupy_frontend(self):
+        # NOTE(review): `rmm` is not imported by name in this file; it is
+        # presumably re-exported by `from pycutlass import *` — confirm.
+        # This replaces CuPy's allocator and does not restore it afterwards.
+        cp.cuda.set_allocator(rmm.rmm_cupy_allocator)
+
+        problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
+
+        # Same integer-valued inputs as the torch test, so exact comparison is valid.
+        tensor_A = cp.ceil(cp.random.uniform(low=-8.5, high=7.5, size=(problem_size.m(), problem_size.k()), dtype=cp.float32))
+        tensor_B = cp.ceil(cp.random.uniform(low=-8.5, high=7.5, size=(problem_size.k(), problem_size.n()), dtype=cp.float32))
+        tensor_C = cp.ceil(cp.random.uniform(low=-8.5, high=7.5, size=(problem_size.m(), problem_size.n()), dtype=cp.float32))
+        tensor_D = cp.ones_like(tensor_C)
+
+        alpha = 1.0
+        beta = 1.0
+
+        tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
+
+        # Fixed keyword spelling (was `split_k_splices`); the value 1 is unchanged.
+        arguments = GemmArguments(
+            operation=self.operation, problem_size=problem_size,
+            A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
+            output_op=LinearCombinationFunctorArguments(alpha, beta),
+            gemm_mode=cutlass.gemm.Mode.Gemm, split_k_slices=1
+        )
+
+        self.operation.run(arguments)
+
+        arguments.sync()
+
+        self.assertTrue(cp.array_equal(tensor_D, tensor_D_ref))
+
+
+
+# Script entry point: configure the pycutlass memory pool, then let unittest
+# discover and run the TestCase classes defined in this module.
+# NOTE(review): the two 2**32 arguments are presumably the initial and maximum
+# pool sizes in bytes — confirm against pycutlass.get_memory_pool.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**32, 2**32)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/gemm/init.py
+++ b/tools/library/scripts/pycutlass/test/gemm/init.py
--- a/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py
+++ b/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py
@ -0,0 +1,93 @@
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+from pycutlass.test.gemm_testbed import test_all_gemm
+
+class GemmBF16TensorOpSm80(unittest.TestCase):
+    """bfloat16 tensor-op GEMM configurations for arch 80 with float32 accumulation."""
+
+    def SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32_64x128x64_32x64x64(self):
+        # NOTE(review): this method has no `test_` prefix, so unittest's default
+        # loader will not collect it. Confirm whether it is disabled on purpose.
+        #
+        # Column-major bfloat16 A and B, row-major float32 C, 64x128x64 tile,
+        # 4 stages, run through the "universal" testbed mode.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.bfloat16,
+            element_b=cutlass.bfloat16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[64, 128, 64],
+            stages=4,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.bfloat16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.bfloat16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float32, layout=cutlass.RowMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+
+    def test_SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32_128x256x64_64x64x64(self):
+        # Row-major bfloat16 A, B and C, 6 stages, run through the "multistage"
+        # testbed mode.
+        # NOTE(review): the name says 128x256x64 but the threadblock tile below
+        # is [64, 128, 32] — confirm which is intended.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.bfloat16,
+            element_b=cutlass.bfloat16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[64, 128, 32],
+            stages=6,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.bfloat16, layout=cutlass.RowMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.bfloat16, layout=cutlass.RowMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.bfloat16, layout=cutlass.RowMajor, alignment=8)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "multistage")
+        self.assertTrue(passed)
+
+# Script entry point: configure the pycutlass memory pool, then let unittest
+# discover and run the TestCase classes defined in this module.
+# NOTE(review): the two 2**24 (16,777,216) arguments are presumably the initial
+# and maximum pool sizes in bytes — confirm against pycutlass.get_memory_pool.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**24, 2**24)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py
+++ b/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py
@ -0,0 +1,425 @@
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+from pycutlass.test.gemm_testbed import test_all_gemm
+
+
+class GemmF16Sm80(unittest.TestCase):
+    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
+        # float16 tensor-op GEMM with float32 accumulation, using the batched
+        # identity swizzle and direct_store=True.
+        # NOTE(review): the name (f32t_f32n_f32t ... bf16) does not describe this
+        # configuration (column-major f16 A, row-major f16 B, column-major f32 C).
+        # The same name appears in gemm_f32_sm80.py, so it looks copied — confirm.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[128, 128, 32],
+            stages=3,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float32, layout=cutlass.ColumnMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.BatchedIdentitySwizzle,
+            direct_store=True,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+
+    def test_SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32_128x128x64_64x64x64(self):
+        # Column-major f16 A and B, row-major f16 C, float32 accumulation,
+        # 128x128x64 threadblock tile with 3 stages.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[128, 128, 64],
+            stages=3,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64(self):
+        # Column-major f16 A and B, column-major f32 C, float32 accumulation,
+        # 128x256x64 threadblock tile with 3 stages.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[128, 256, 64],
+            stages=3,
+            warp_count=[2, 4, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float32, layout=cutlass.ColumnMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32_256x128x64_64x64x64(self):
+        # Column-major f16 A and B, row-major f32 C, float32 accumulation,
+        # 256x128x64 threadblock tile with 3 stages.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[256, 128, 64],
+            stages=3,
+            warp_count=[4, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float32, layout=cutlass.RowMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
+        # Column-major f16 A, row-major f16 B and C, float16 accumulation and
+        # epilogue, 128x64x64 threadblock tile with 3 stages.
+        # NOTE(review): the name says "sliced_k" with a 64x64x32 warp tile, but the
+        # last warp_count entry is 1, which presumably means K is not split across
+        # warps — confirm.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float16,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[128, 64, 64],
+            stages=3,
+            warp_count=[2, 1, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float16,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32_64x64x32_32x32x32(self):
+        # Column-major f16 A, row-major f16 B and C, 64x64x32 threadblock tile
+        # with 10 stages.
+        # NOTE(review): the name says f32 output and f32 accumulation, but C, the
+        # accumulator and the epilogue are all float16 here — confirm.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float16,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[64, 64, 32],
+            stages=10,
+            warp_count=[2, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float16,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32_256x128x64_64x64x64(self):
+        # Column-major f16 A, row-major f16 B, float32 accumulation,
+        # 256x128x64 threadblock tile with 3 stages.
+        # NOTE(review): the name says f32t for the output, but C is declared
+        # float16 with alignment 8 here — confirm.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[256, 128, 64],
+            stages=3,
+            warp_count=[4, 2, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_test_SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
+        # Row-major f16 A, column-major f16 B, row-major f16 C, 128x64x64
+        # threadblock tile with 3 stages.
+        # NOTE(review): the `test_` prefix is doubled in the name (still collected
+        # by unittest). The name also says f16 accumulation and "sliced_k", while
+        # the accumulator is float32 and the last warp_count entry is 1 — confirm.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[128, 64, 64],
+            stages=3,
+            warp_count=[2, 1, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=4)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_128x256x64_64x64x64(self):
+        # Row-major f16 A and B, column-major C, float32 accumulation,
+        # 128x256x64 threadblock tile with 3 stages.
+        # NOTE(review): the name says f32n for the output, but C is declared
+        # float16 with alignment 8 here — confirm.
+        mma = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16,
+            element_b=cutlass.float16,
+            element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add,
+        )
+
+        tile = TileDescription(
+            threadblock_shape=[128, 256, 64],
+            stages=3,
+            warp_count=[2, 4, 1],
+            math_instruction=mma,
+            min_compute=80,
+            max_compute=80,
+        )
+
+        operand_a = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_b = TensorDescription(element=cutlass.float16, layout=cutlass.RowMajor, alignment=8)
+        operand_c = TensorDescription(element=cutlass.float16, layout=cutlass.ColumnMajor, alignment=8)
+
+        gemm = GemmOperationUniversal(
+            arch=80,
+            tile_description=tile,
+            A=operand_a,
+            B=operand_b,
+            C=operand_c,
+            element_epilogue=cutlass.float32,
+            epilogue_functor=EpilogueFunctor.LinearCombination,
+            swizzling_functor=cutlass.IdentitySwizzle1,
+        )
+
+        passed = test_all_gemm(gemm, "universal")
+        self.assertTrue(passed)
+    
+    def test_SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32_128x256x64_64x64x64(self):
+        # Row-major f16 A and B, row-major f32 C, float32 accumulation,
+        # 128x256x64 threadblock tile with 3 stages.
+        #
+        # Fix: A, B and C were all ColumnMajor, which made this test identical to
+        # test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64
+        # and left the row-major case named here untested. Elsewhere in this file
+        # "t" in a test name corresponds to RowMajor and "n" to ColumnMajor.
+        math_inst = MathInstruction(
+            instruction_shape=[16, 8, 16],
+            element_a=cutlass.float16, element_b=cutlass.float16,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+
+        tile_description = TileDescription(
+            threadblock_shape=[128, 256, 64],
+            stages=3, warp_count=[2, 4, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+
+        A = TensorDescription(
+            element=cutlass.float16, layout=cutlass.RowMajor,
+            alignment=8
+        )
+        B = TensorDescription(
+            element=cutlass.float16, layout=cutlass.RowMajor,
+            alignment=8
+        )
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.LinearCombination
+
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+
+        self.assertTrue(test_all_gemm(operation, "universal"))
+    
+    
+
+# Script entry point: configure the pycutlass memory pool, then let unittest
+# discover and run the TestCase classes defined in this module.
+# NOTE(review): the two 2**24 (16,777,216) arguments are presumably the initial
+# and maximum pool sizes in bytes — confirm against pycutlass.get_memory_pool.
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**24, 2**24)
+    unittest.main()
--- a/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py
+++ b/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py
@ -0,0 +1,138 @@
+import pycutlass
+from pycutlass import *
+from pycutlass.memory_manager import get_allocated_size
+from pycutlass.test import *
+import unittest
+
+from pycutlass.test.gemm_testbed import test_all_gemm
+
+
+class GemmF32nF32nF32nTensorOpF32Sm80(unittest.TestCase):  # f32-operand tensor-op GEMM cases built with arch=80
+    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
+        math_inst = MathInstruction(  # f32 operands and accumulator, math operation multiply_add_fast_bf16
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add_fast_bf16
+        )
+        # Threadblock 128x128x32 over 2x2x1 warps: 128/2 x 128/2 x 32/1 = the 64x64x32 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A RowMajor (t), B ColumnMajor (n), C RowMajor (t).
+        A = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=4
+        )
+        B = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "universal" mode.
+        self.assertTrue(test_all_gemm(operation, "universal"))
+
+
+    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32_128x128x32_64x64x32(self):
+        math_inst = MathInstruction(  # f32 operands and accumulator, plain multiply_add
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 128x128x32 over 2x2x1 warps: 128/2 x 128/2 x 32/1 = the 64x64x32 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A ColumnMajor (n), B ColumnMajor (n), C RowMajor (t).
+        A = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+        B = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "universal" mode.
+        self.assertTrue(test_all_gemm(operation, "universal"))
+    
+    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_fast_accurate_f32_64x64x32_32x32x32(self):
+        math_inst = MathInstruction(  # f32 operands and accumulator, math operation multiply_add_fast_f32
+            instruction_shape=[16, 8, 8],
+            element_a=cutlass.float32, element_b=cutlass.float32,
+            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add_fast_f32
+        )
+        # Threadblock 64x64x32 over 2x2x1 warps: 64/2 x 64/2 x 32/1 = the 32x32x32 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[64, 64, 32],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A ColumnMajor (n), B ColumnMajor (n), C RowMajor (t).
+        A = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+        B = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "universal" mode.
+        self.assertTrue(test_all_gemm(operation, "universal"))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**24, 2**24)  # both arguments are 2**24 (16,777,216); presumably pool sizes in bytes -- confirm against pycutlass.get_memory_pool
+    pycutlass.compiler.load_from_cache()  # NOTE(review): the sibling test scripts visible in this patch do not make this call; presumably it reuses previously compiled kernels -- confirm
+    unittest.main()  # runs every TestCase defined in this module
--- a/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py
+++ b/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py
@ -0,0 +1,95 @@
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+from pycutlass.test.gemm_testbed import test_all_gemm
+
+class GemmF64TensorOpSm80(unittest.TestCase):  # f64 tensor-op GEMM cases built with arch=80
+    def test_SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64_32x32x16_16x16x16(self):
+        math_inst = MathInstruction(  # tensor-op multiply-add with f64 operands and accumulator
+            instruction_shape=[8, 8, 4],
+            element_a=cutlass.float64, element_b=cutlass.float64,
+            element_accumulator=cutlass.float64, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 32x32x16 over 2x2x1 warps: 32/2 x 32/2 x 16/1 = the 16x16x16 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[32, 32, 16],
+            stages=4, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+
+        # alignment 1 restricted for double
+        A = TensorDescription(  # layouts follow the name: A ColumnMajor (n), B RowMajor (t), C RowMajor (t)
+            element=cutlass.float64, layout=cutlass.ColumnMajor,
+            alignment=1
+        )
+        B = TensorDescription(
+            element=cutlass.float64, layout=cutlass.RowMajor,
+            alignment=1
+        )
+        C = TensorDescription(
+            element=cutlass.float64, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        element_epilogue = cutlass.float64
+
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "universal" mode.
+        self.assertTrue(test_all_gemm(operation, "universal"))
+    
+    def test_SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64_64x64x16_32x32x16(self):
+        math_inst = MathInstruction(  # tensor-op multiply-add with f64 operands and accumulator
+            instruction_shape=[8, 8, 4],
+            element_a=cutlass.float64, element_b=cutlass.float64,
+            element_accumulator=cutlass.float64, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 64x64x16 over 2x2x1 warps: 64/2 x 64/2 x 16/1 = the 32x32x16 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[64, 64, 16],
+            stages=4, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+
+        # alignment 1 restricted for double
+        A = TensorDescription(  # layouts follow the name: A RowMajor (t), B ColumnMajor (n), C RowMajor (t)
+            element=cutlass.float64, layout=cutlass.RowMajor,
+            alignment=1
+        )
+        B = TensorDescription(
+            element=cutlass.float64, layout=cutlass.ColumnMajor,
+            alignment=1
+        )
+        C = TensorDescription(
+            element=cutlass.float64, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        element_epilogue = cutlass.float64
+
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "universal" mode.
+        self.assertTrue(test_all_gemm(operation, "universal"))
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**24, 2**24)  # both arguments are 2**24 (16,777,216); presumably pool sizes in bytes -- confirm against pycutlass.get_memory_pool
+    unittest.main()  # runs every TestCase defined in this module
--- a/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py
+++ b/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py
@ -0,0 +1,197 @@
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+from pycutlass.test.gemm_grouped_testbed import TestbedGrouped
+
+
+class GemmGroupedSm80(unittest.TestCase):  # grouped-GEMM cases; each one runs for SchedulerMode.Device and SchedulerMode.Host
+    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32(self):
+        math_inst = MathInstruction(  # tensor-op multiply-add: f16 operands, f32 accumulator
+            instruction_shape=[16, 8, 16], element_a=cutlass.float16,
+            element_b=cutlass.float16, element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 128x128x32 over 2x2x1 warps: 128/2 x 128/2 x 32/1 = the 64x64x32 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+
+        A = TensorDescription(
+            element=cutlass.float16, layout=cutlass.ColumnMajor,
+            alignment=8
+        )
+        # NOTE(review): the test name says f16t for B, but B is ColumnMajor like A and C (both named n) -- confirm the intended layout.
+        B = TensorDescription(
+            element=cutlass.float16, layout=cutlass.ColumnMajor,
+            alignment=8
+        )
+
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.float32
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        swizzling_functor = cutlass.BatchedIdentitySwizzle
+        # Build and run one operation per scheduler precompute mode: Device first, then Host.
+        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
+            operation = GemmOperationGrouped(
+                tile_description.minimum_compute_capability,
+                tile_description, A, B, C,
+                element_epilogue,
+                epilogue_functor, swizzling_functor,
+                precompute_mode=precompute_mode
+            )
+
+            testbed = TestbedGrouped(operation=operation)
+            # presumably 24 is the number of GEMM problems in the group -- confirm against TestbedGrouped.run
+            self.assertTrue(testbed.run(24))
+    
+    def test_SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64_64x64x16_32x32x16(self):
+        math_inst = MathInstruction(  # tensor-op multiply-add with f64 operands and accumulator
+            instruction_shape=[8, 8, 4], element_a=cutlass.float64,
+            element_b=cutlass.float64, element_accumulator=cutlass.float64,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 64x64x16 over 2x2x1 warps: 64/2 x 64/2 x 16/1 = the 32x32x16 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[64, 64, 16],
+            stages=4, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A RowMajor (t), B RowMajor (t), C ColumnMajor (n).
+        A = TensorDescription(
+            element=cutlass.float64, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        B = TensorDescription(
+            element=cutlass.float64, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        C = TensorDescription(
+            element=cutlass.float64, layout=cutlass.ColumnMajor,
+            alignment=1
+        )
+
+        element_epilogue = cutlass.float64
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        swizzling_functor = cutlass.BatchedIdentitySwizzle
+        # Build and run one operation per scheduler precompute mode: Device first, then Host.
+        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
+            operation = GemmOperationGrouped(
+                tile_description.minimum_compute_capability,
+                tile_description, A, B, C,
+                element_epilogue,
+                epilogue_functor, swizzling_functor,
+                precompute_mode=precompute_mode
+            )
+
+            testbed = TestbedGrouped(operation=operation)
+            # presumably 24 is the number of GEMM problems in the group -- confirm against TestbedGrouped.run
+            self.assertTrue(testbed.run(24))
+    
+    def test_SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32_128x64x8_64x32x1(self):
+        math_inst = MathInstruction(  # SIMT (non tensor-op) multiply-add with f32 operands and accumulator
+            instruction_shape=[1, 1, 1], element_a=cutlass.float32,
+            element_b=cutlass.float32, element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.Simt,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 128x64x8 over a 2x2x1 warp grid, 4 stages.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 64, 8],
+            stages=4, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A, B and C are all RowMajor (t).
+        A = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        B = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.RowMajor,
+            alignment=1
+        )
+
+        element_epilogue = cutlass.float32
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        swizzling_functor = cutlass.BatchedIdentitySwizzle
+        # Build and run one operation per scheduler precompute mode: Device first, then Host.
+        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
+            operation = GemmOperationGrouped(
+                tile_description.minimum_compute_capability,
+                tile_description, A, B, C,
+                element_epilogue,
+                epilogue_functor, swizzling_functor,
+                precompute_mode=precompute_mode
+            )
+
+            testbed = TestbedGrouped(operation=operation)
+            # presumably 27 is the number of GEMM problems in the group -- confirm against TestbedGrouped.run
+            self.assertTrue(testbed.run(27))
+    
+    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32_cache(self):
+        math_inst = MathInstruction(  # same instruction, tile and tensors as the first test in this class; only run()'s argument differs
+            instruction_shape=[16, 8, 16], element_a=cutlass.float16,
+            element_b=cutlass.float16, element_accumulator=cutlass.float32,
+            opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # NOTE(review): the "_cache" suffix presumably means this case is meant to hit an already-compiled kernel -- confirm.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 32],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+
+        A = TensorDescription(
+            element=cutlass.float16, layout=cutlass.ColumnMajor,
+            alignment=8
+        )
+        # NOTE(review): the test name says f16t for B, but B is ColumnMajor like A and C (both named n) -- confirm the intended layout.
+        B = TensorDescription(
+            element=cutlass.float16, layout=cutlass.ColumnMajor,
+            alignment=8
+        )
+
+        C = TensorDescription(
+            element=cutlass.float32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.float32
+        epilogue_functor = EpilogueFunctor.LinearCombination
+        swizzling_functor = cutlass.BatchedIdentitySwizzle
+        # Build and run one operation per scheduler precompute mode: Device first, then Host.
+        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
+            operation = GemmOperationGrouped(
+                tile_description.minimum_compute_capability,
+                tile_description, A, B, C,
+                element_epilogue,
+                epilogue_functor, swizzling_functor,
+                precompute_mode=precompute_mode
+            )
+
+            testbed = TestbedGrouped(operation=operation)
+            # presumably 5 is the number of GEMM problems in the group -- confirm against TestbedGrouped.run
+            self.assertTrue(testbed.run(5))
+
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)  # both arguments are 2**26 (67,108,864), four times the 2**24 used by the non-grouped GEMM scripts in this patch; presumably pool sizes in bytes -- confirm
+    unittest.main()  # runs every TestCase defined in this module
--- a/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py
+++ b/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py
@ -0,0 +1,219 @@
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+from pycutlass.test.gemm_testbed import test_all_gemm
+
+class GemmS8TensorOpF32Sm80(unittest.TestCase):  # int8-operand tensor-op GEMM cases (int32 accumulation) built with arch=80
+    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_64x64x64_32x32x64(self):
+        math_inst = MathInstruction(  # int8 operands, int32 accumulator, math operation multiply_add_saturate
+            instruction_shape=[16, 8, 32],
+            element_a=cutlass.int8, element_b=cutlass.int8,
+            element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add_saturate
+        )
+        # Threadblock 64x64x64 over 2x2x1 warps: 64/2 x 64/2 x 64/1 = the 32x32x64 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[64, 64, 64],
+            stages=6, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # NOTE(review): name says s8t_s8n_s8t, but A/C are ColumnMajorInterleaved32 and B is RowMajorInterleaved32 -- confirm the naming convention for interleaved layouts.
+        A = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajorInterleaved32,
+            alignment=16
+        )
+        B = TensorDescription(
+            element=cutlass.int8, layout=cutlass.RowMajorInterleaved32,
+            alignment=16
+        )
+        C = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajorInterleaved32,
+            alignment=8
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.FastLinearCombinationClamp
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "interleaved" mode.
+        self.assertTrue(test_all_gemm(operation, "interleaved"))
+    
+    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_256x128x128_64x64x128(self):
+        math_inst = MathInstruction(  # int8 operands, int32 accumulator, plain multiply_add
+            instruction_shape=[16, 8, 32],
+            element_a=cutlass.int8, element_b=cutlass.int8,
+            element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # NOTE(review): the test name says 256x128x128, but the threadblock built here is 128x128x128 -- confirm which is intended.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 128],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A RowMajor (t), B ColumnMajor (n), C RowMajor (t).
+        A = TensorDescription(
+            element=cutlass.int8, layout=cutlass.RowMajor,
+            alignment=16
+        )
+        B = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajor,
+            alignment=16
+        )
+        C = TensorDescription(
+            element=cutlass.int8, layout=cutlass.RowMajor,
+            alignment=16
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.FastLinearCombinationClamp
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "multistage" mode.
+        self.assertTrue(test_all_gemm(operation, "multistage"))
+    
+    def test_SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32_128x128x128_64x64x128(self):
+        math_inst = MathInstruction(  # int8 operands, int32 accumulator, plain multiply_add
+            instruction_shape=[16, 8, 32],
+            element_a=cutlass.int8, element_b=cutlass.int8,
+            element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 128x128x128 over 2x2x1 warps: 128/2 x 128/2 x 128/1 = the 64x64x128 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 128],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Layouts follow the name: A RowMajor (t), B ColumnMajor (n), C ColumnMajor (n).
+        A = TensorDescription(
+            element=cutlass.int8, layout=cutlass.RowMajor,
+            alignment=16
+        )
+        B = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajor,
+            alignment=16
+        )
+        C = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajor,
+            alignment=16
+        )
+
+        element_epilogue = cutlass.float32
+
+        epilogue_functor = EpilogueFunctor.FastLinearCombinationClamp
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "multistage" mode.
+        self.assertTrue(test_all_gemm(operation, "multistage"))
+    
+    def test_SM80_Device_Gemm_s8t_s8n_s32n_tensor_op_s32_128x128x128_64x64x128(self):
+        math_inst = MathInstruction(  # int8 operands, int32 accumulator, plain multiply_add
+            instruction_shape=[16, 8, 32],
+            element_a=cutlass.int8, element_b=cutlass.int8,
+            element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 128x128x128 over 2x2x1 warps: 128/2 x 128/2 x 128/1 = the 64x64x128 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 128],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Output is int32 here (C element, epilogue element), with LinearCombinationClamp instead of the Fast variant used for int8 output.
+        A = TensorDescription(
+            element=cutlass.int8, layout=cutlass.RowMajor,
+            alignment=16
+        )
+        B = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajor,
+            alignment=16
+        )
+        C = TensorDescription(
+            element=cutlass.int32, layout=cutlass.ColumnMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.int32
+
+        epilogue_functor = EpilogueFunctor.LinearCombinationClamp
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "multistage" mode.
+        self.assertTrue(test_all_gemm(operation, "multistage"))
+    
+    def test_SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32_128x128x128_64x64x128(self):
+        math_inst = MathInstruction(  # int8 operands, int32 accumulator, plain multiply_add
+            instruction_shape=[16, 8, 32],
+            element_a=cutlass.int8, element_b=cutlass.int8,
+            element_accumulator=cutlass.int32, opcode_class=cutlass.OpClass.TensorOp,
+            math_operation=MathOperation.multiply_add
+        )
+        # Threadblock 128x128x128 over 2x2x1 warps: 128/2 x 128/2 x 128/1 = the 64x64x128 in the test name.
+        tile_description = TileDescription(
+            threadblock_shape=[128, 128, 128],
+            stages=3, warp_count=[2, 2, 1],
+            math_instruction=math_inst, min_compute=80, max_compute=80
+        )
+        # Same as the s32n case except that C is RowMajor (t).
+        A = TensorDescription(
+            element=cutlass.int8, layout=cutlass.RowMajor,
+            alignment=16
+        )
+        B = TensorDescription(
+            element=cutlass.int8, layout=cutlass.ColumnMajor,
+            alignment=16
+        )
+        C = TensorDescription(
+            element=cutlass.int32, layout=cutlass.RowMajor,
+            alignment=4
+        )
+
+        element_epilogue = cutlass.int32
+
+        epilogue_functor = EpilogueFunctor.LinearCombinationClamp
+        
+        swizzling_functor = cutlass.IdentitySwizzle1
+
+        operation = GemmOperationUniversal(
+            arch=80, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue,
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        # The case passes only when the shared testbed returns a truthy value for the "multistage" mode.
+        self.assertTrue(test_all_gemm(operation, "multistage"))
+    
+
+
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**24, 2**24)  # both arguments are 2**24 (16,777,216); presumably pool sizes in bytes -- confirm against pycutlass.get_memory_pool
+    unittest.main()  # runs every TestCase defined in this module
--- a/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py
+++ b/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py
@ -0,0 +1,9 @@
+import pycutlass
+import unittest
+
+if __name__ == '__main__':
+    pycutlass.get_memory_pool(2**26, 2**26)  # both arguments are 2**26 (67,108,864); presumably pool sizes in bytes -- confirm against pycutlass.get_memory_pool
+    loader = unittest.TestLoader()
+    tests = loader.discover('./', 'gemm_*.py')  # discovery is relative to the current working directory, so run this script from the test/gemm directory
+    testRunner = unittest.runner.TextTestRunner()
+    raise SystemExit(not testRunner.run(tests).wasSuccessful())  # propagate the outcome: exit status 1 if any test failed or errored, 0 otherwise (the result used to be discarded, so the script always exited 0)
--- a/tools/library/scripts/pycutlass/test/unit/cached_results_SM80_2080.txt
+++ b/tools/library/scripts/pycutlass/test/unit/cached_results_SM80_2080.txt
@ -0,0 +1,350 @@
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
+conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3016005301 4142905842
+conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3337296764 4183699161 3654176452
+conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3852963969 864006170 920352568
+conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 2750240096 2120184232 2600672872
+conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3224082300 2084034673 3588056946
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 3033073939 304048758 1882633089
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 610026473 447427404 2639856195
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2818680871 58428273 3332443900
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 1891702153 103393067 2558647731
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 162127134 3567670201 3173514764
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 162127134 3567670201 363897018
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 1350938697 1696306119 1005311005
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3884703009 3552725366 1975514757 1210310496
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3393706648 3519979618 1149261202 799742106
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3409586999 409840186 1724648597 2642018980
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1815685330 1398622058 2431638856 1016967269
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2555706782 3271563943 1020153035 299097281
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4173830187 736684125 472021975 2064613035
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 2751224679 2250540122 3725638844
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 1583610315 3287895411 2394340435
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3010335403 2356047354 7055632 915702611
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2748205217 2539405983 1217377670 2011175578
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2748205217 2114448427 249997769 2711364520
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1528321643 1532777511 3597171412 296622236
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1326617037 3415095747 847196866 1481554158
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1122706355 2841974626 2791878604 632900093
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1728385278 2462678309 3066040807 1334515660
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2175275779 1117731224 857614711 2096711962
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4140401170 3710340185 1683575469 317397427
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3552249008 2918315307 2290683130 536859016
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2869959072 2516947012 3328285094 2393284712
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1349264322 1823945068 400087667 2893025864
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3321662203 426084311 4233055093 4078572279
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3321662203 426084311 4233055093 3044377475
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 803041205 2521863610 3206942690 127091020
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4083508736 37801570 240515127 2234797539
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2207374588 535059558 2268619394 1489214085
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 dnhwc_dnhwc_dnhwc_d_d 3614026280 1721563676 2979825951 1104908081
+conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 2226238626 2053372396 2462697514
+conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 235646718 1374133172 3696289981
+conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 184705847 3148323124 84213385
+conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2264868815 1724845245 3498302256 4094034457
+conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 233390337 1801952602 3532884734
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3048346885 2306163504 642074123 4083120683
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2798030672 683783039 3025345160 1890891136
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1731071506 1844675436 2292509333 4006304179
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 132147677 604503886 143348844 3037223953
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 1678940393 3405733837 1820114523
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 1678940393 3405733837 467254076
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1794301352 2320042028 2134048179 508141072
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 561590023 3382154048 4154621995 517057927
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 593915463 2360210889 2685491481 2265099675
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 2226238626 1155815529 558646991
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 1876429398 4216128545 1754596046
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 348523586 2609019785 3938405680 2601133907
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1984146316 1475870285 1157657800 1143965395
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2971058593 1478256319 503014742 3930504182
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 1214508920 1537003531 3830217225
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2031518387 2695641559 933408074 4026827730
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 517276344 1158854831 3123629043
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 810746104 517276344 1448394173 1864626308
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2536722089 711164468 2465036841 2993377049
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2264868815 3003481795 333430991 3094857755
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1621735632 1126010692 3313703859 637497110
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1130094757 2605103293 2477101661 1276123281
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 4286533436 1302900889 2613245986 2523724148
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 3048346885 923365529 1681226722 417509256
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2798030672 3441819646 1293178065 188472807
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1731071506 1117530547 2706270359 502156742
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 132147677 2029225588 3851064913 3164530726
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 2337137106 3312954197 2466682688
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1269799445 2337137106 3312954197 2684544683
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 1794301352 72938921 2354994612 1463501392
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 252570564 2903451081 3619280116 1448586411
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 dnhwc_dnhwc_dnhwc_d_d 2037991187 1665743881 241585763 103256264
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 dnhwc_dnhwc_dnhwc_d_d 2653975581 3337638999 1440125233 2448165745
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_f_f 2149247508 1775375365 3317647029 2497607448
+conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
+conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
+conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
+conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
+conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 927718585 4106152802 720400339 3989318043
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 3464637181 4051957661 126285749
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 832653836 3723472741 2044236350 2463899842
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2075083065 2042513140 3691286135 322550345
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4005590448 1116254439 2328237343 1918824440
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 181075276 1743485155 3526891198 1979405632
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1513864544 386662952 4057300775 1456746562
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 856324887 3954249564 2340393915 4127188930
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 1300426008 2921497047 4145791960
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4110991321 1300426008 4080981223 3076991942
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 832653836 447261065 3823545045 392205236
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3484040069 2966693627 3900095420 919511892
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1513864544 1759979610 4272621682 1029257940
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1906605830 2980501720 978889789 3136018973
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 805717279 3502822733 1810065278 1387739380
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 868180534 3289288595 209477462 4142168174
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3437976747 3391080565 97275649 4063718293
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4195072693 1669352457 2182133559 2494741804
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3457330201 1126870455 319272291 3811977088
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 754609939 1723074453 1660326213 3902884425
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 754609939 1723074453 1660326213 423159249
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1690216859 2413490039 223529410 3303697952
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3168796339 1601750164 1428743330 403295189
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 261954979 1300976652 2749562370 3058142403
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_f_f 3747142491 1747587481 3143977827 835130482
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
+conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 4172720592 446082987
+conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1101653138 3727072529 875733988
+conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3906526127 655926291 939844058
+conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 2031878085 1709408312 1277173429
+conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 22652410 1700696921 2175632852
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 436588210 470857851 284463232
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 59350507 969037229 1510558485
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 856797938 2030818524 4231831552
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 2885833872 2829967135 3441569557
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 4148824382 2827420298 378131261
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 4148824382 2827420298 2955292920
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 1474248671 1302526250 4182204885
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1569788048 162506176 819639712 763595635
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 945660191 3750377696 2496492611 3515056508
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2806300501 2591577756 3148637036 3845512743
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2322444122 3525997046 281106520 3456307300
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 327345109 1137297282 1938163814 2551101563
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 797067973 481331945 350851834 2477733239
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 2044204046 1034822169 3340281844
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 4174274001 1597212204 1881272946
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1316460560 1535088984 3001492060 2308505016
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3190527989 3733991924 4211138051 3710311115
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3190527989 3430768821 1043108884 4185640072
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 943531303 1948306075 3877008798 2803592376
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3262141476 4125717435 2946529611 2221512094
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1599291337 3982786366 1581171257 1188352423
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2237070215 3046262465 1926804094 1435916873
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 721666814 2012769306 1712378956 1388990183
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1596349869 3775131163 355203300 1126174452
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1380587417 1208642645 2886387159 3113955983
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1332573203 1417735573 1422796372 3309229181
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2714027800 2106992819 1196036582 2095126659
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1105097447 1992731268 2198911423 3378137735
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1105097447 1992731268 2198911423 3868431311
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2552471160 2218470296 2332616929 923645661
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2231354584 4035702005 3839068434 8981294
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 4019719318 3985307916 3604065639 277096636
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 bf16nhwc_bf16nhwc_fnhwc_f_f 258381429 3482776077 2663631601 593179089
+conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 188810648 1623218578 2585892217
+conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 691990354 3253144559 2988350639
+conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 1670375523 2425320272 2553108650
+conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1049321188 1865889553 3610888033 1459693945
+conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 3236781482 1382111427 1986396315
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 463742721 2524037630 3070473696 210045128
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 738614177 4071452982 3401957738 2920893800
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2479111539 2662555669 781892324 2338234282
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2089076160 260434096 1539389419 1219120658
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 3344412669 2885305868 1926445693
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 3344412669 2885305868 1478058549
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3945616248 4118489020 2885143346 1545684873
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 295760528 1685244361 3337423971 772814550
+conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 623727338 942771643 2634710231 3063349371
+conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 188810648 2709881923 3532383400
+conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 3762161398 3733128758 3693097785
+conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 139944998 3812563855 253288229 1359907535
+conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 492562992 3677108443 525487530 445191233
+conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 594197095 3773864559 91136873 4170763393
+conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 1025574686 1127709182 677727764
+conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1901075489 3296829308 2591894666 2932517926
+conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 4223561525 1263618595 50680160
+conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1613915345 4223561525 1756414462 3209752057
+conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2788041828 1023542180 121940906 624551470
+conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1049321188 296097075 1423016429 1058165639
+conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3820648800 4160685370 2761559427 1788182893
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1859384988 222880684 1650970502 1632078530
+conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 1704522433 2403392926 3985958544 1432584676
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 463742721 3455033786 385631111 1683348880
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 738614177 3199562330 1513955316 2131256035
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2479111539 2702777753 2608107448 4014212857
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2089076160 4042009058 106232038 1140762595
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 2260768172 1186911503 3194129408
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 14838294 2260768172 1186911503 1312312812
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 3945616248 2287161276 36034283 4262860382
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 2906914535 476297538 14375779 1340176713
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 bf16nhwc_bf16nhwc_fnhwc_f_f 4292101959 3378414564 4259930640 1392755176
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 bf16nhwc_bf16nhwc_fnhwc_f_f 3529371817 368260304 4137156526 122558013
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 2948718568 2631391783 3260825675 4278587299
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 1635109696 2835574424 4179385325 2803281440
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 3344954627 1649157278 2032056735 1176638626
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 61750237 3452849177 1697665310 3475459781
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 1394759191 1571308277 898534533 4125341936
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 3402206912 2433594404 1575577431 4106154211
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 98638790 2735493952 346473870 1911666301
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 98638790 2735493952 346473870 2124440208
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 2934485636 3286257323 541566528 1113783492
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nc32hw32_s8c32rsk32_s8nc32hw32_i_f 164942943 4259285988 1250700182 508419908
+conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3805460372 2607401558 3465030781 210641751
+conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 4200926784 1001915027 387475271 3360115596
+conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 331078659 469730619 2547196469 1620698703
+conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 431968022 1614654085 903827412 1349891842
+conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3674369485 1055554271 3217013807 1356703347
+conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 3227824772 365527403 2720889763
+conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 2150996976 2899308770 2371758816
+conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2609462247 2124373651 2711906981 3194739760
+conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1070162100 2750964634 3090791018 3481982191
+conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1070162100 1563941622 767747438 3163252390
+conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 884815233 3576251756 3216742798 3534462723
+conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3230717758 3192193994 1161445944 371179683
+conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2450454245 2905280248 910194866 839083662
+conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2948718568 2631391783 638794727 4292051282
+conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1635109696 2835574424 1855687620 130932480
+conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3344954627 1649157278 4191418350 958044197
+conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 61750237 3452849177 3260472389 771128506
+conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 1394759191 1571308277 4279538191 956191103
+conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 3402206912 2433594404 2021112123 2983097553
+conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 98638790 2735493952 3178839372 568554158
+conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 98638790 2735493952 3178839372 18194802
+conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2934485636 3286257323 2559221535 2310182528
+conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 164942943 4259285988 984016853 888753301
+conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 s8nhwc_s8nhwc_inhwc_i_i 2823094147 1681845497 4242738907 3244428635
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 s8nhwc_s8nhwc_inhwc_i_i 4060010502 2881035321 3927119619 3311661122
--- a/tools/library/scripts/pycutlass/test/unit/test_sm80.py
+++ b/tools/library/scripts/pycutlass/test/unit/test_sm80.py
@ -0,0 +1,440 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+## Test case generator for SM80
+
+import pycutlass
+from pycutlass import *
+from pycutlass.test import *
+import unittest
+
+#
+# Create GEMM operation
+#
+
+def TestGemmOperator(gemm_kind, math_inst, layout, alignment, tiling, arch, mixed=False,
+    epilogue_functor = EpilogueFunctor.LinearCombination, 
+    swizzling_functor=cutlass.IdentitySwizzle1, **kwargs):
+    """
+    Test GEMM Operation based on configuration
+    """
+
+    if "data_type" in kwargs.keys():
+        data_type = kwargs["data_type"]
+    else:
+        if mixed or math_inst.element_a == cutlass.bfloat16:
+            data_type = [
+                math_inst.element_a,
+                math_inst.element_b,
+                math_inst.element_accumulator,
+                math_inst.element_accumulator
+            ]
+        else:
+            data_type = [
+                math_inst.element_a,
+                math_inst.element_b,
+                math_inst.element_a,
+                math_inst.element_accumulator
+            ]
+    
+    tile_description = TileDescription(
+        tiling[0], tiling[1], tiling[2],
+        math_inst, arch, arch
+    )
+
+    A = TensorDescription(
+        data_type[0], layout[0], alignment[0]
+    )
+
+    B = TensorDescription(
+        data_type[1], layout[1], alignment[1]
+    )
+
+    C = TensorDescription(
+        data_type[2], layout[2], alignment[2]
+    )
+
+    element_epilogue = data_type[3]
+
+    if gemm_kind == GemmKind.Universal:
+        operation = GemmOperationUniversal(
+            arch=arch, tile_description=tile_description,
+            A=A, B=B, C=C, element_epilogue=element_epilogue, 
+            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
+        )
+        if A.layout in [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]:
+            return test_all_gemm(operation, "interleaved")
+        else:
+            return test_all_gemm(operation, "universal")
+        
+    elif gemm_kind == GemmKind.Grouped:
+        operation = GemmOperationGrouped(
+            arch, tile_description, A, B, C,
+            element_epilogue, epilogue_functor, swizzling_functor,
+            precompute_mode=kwargs["precompute_mode"]
+        )
+        testbed = TestbedGrouped(operation=operation)
+        return testbed.run(24)
+    else:
+        raise NotImplementedError("the gemm kind is not implemented")
+
+
+def TestConv2dOperator(math_inst, alignment, tiling, arch, 
+    stride_supports=[StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided],
+    epilogue_functor=EpilogueFunctor.LinearCombination, 
+    swizzling_functor=cutlass.IdentitySwizzle1, interleaved=False, **kwargs):
+    """
+    Test Conv2d Operation based on configurations
+    """
+
+    mixeds = [False, True, False]
+    conv_kinds = [cutlass.conv.Operator.fprop, cutlass.conv.Operator.dgrad, cutlass.conv.Operator.wgrad]
+
+    results = []
+
+    default_swizzling_functor = swizzling_functor
+
+    if "layout" in kwargs.keys():
+        layout = kwargs["layout"]
+    else:
+        layout = (cutlass.TensorNHWC, cutlass.TensorNHWC, cutlass.TensorNHWC)
+
+    for mixed, conv_kind, stride_support in zip(mixeds, conv_kinds, stride_supports):
+
+        if "data_type" in kwargs.keys():
+            data_type = kwargs["data_type"]
+        else:
+            if mixed or math_inst.element_a == cutlass.bfloat16:
+                data_type = [
+                    math_inst.element_a,
+                    math_inst.element_b,
+                    math_inst.element_accumulator,
+                    math_inst.element_accumulator
+                ]
+            else:
+                data_type = [
+                    math_inst.element_a,
+                    math_inst.element_b,
+                    math_inst.element_a,
+                    math_inst.element_accumulator
+                ]
+        # skip Int8 Conv Backward
+        if data_type[0] == cutlass.int8 and conv_kind in [cutlass.conv.Operator.dgrad, cutlass.conv.Operator.wgrad]:
+            continue
+
+        A = TensorDescription(
+            element=data_type[0],
+            layout=layout[0],
+            alignment=alignment[0])
+        B = TensorDescription(
+            element=data_type[1],
+            layout=layout[1], 
+            alignment=alignment[1])
+        C = TensorDescription(
+            element=data_type[2],
+            layout=layout[2], 
+            alignment=alignment[2])
+        
+        tile_description = TileDescription(
+            threadblock_shape=tiling[0], stages=tiling[1], 
+            warp_count=tiling[2],
+            math_instruction=math_inst,
+            min_compute=arch, max_compute=arch
+        )
+
+        if conv_kind == cutlass.conv.Operator.dgrad and stride_support == StrideSupport.Strided:
+            swizzling_functor = cutlass.StridedDgradIdentitySwizzle1
+        else:
+            swizzling_functor = default_swizzling_functor
+
+        operation = Conv2dOperation(
+            conv_kind=conv_kind, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
+            arch=arch, tile_description=tile_description, A=A, B=B, C=C, 
+            element_epilogue=data_type[3], stride_support=stride_support,
+            epilogue_functor=epilogue_functor,
+            swizzling_functor=swizzling_functor
+        )
+        
+        results.append(test_all_conv2d(operation, interleaved=interleaved))
+    
+    return results
+
+
+
+class Test_SM80(unittest.TestCase):
+    def test_SM80_TensorOp_16816(self):
+        math_instructions = [
+            MathInstruction(
+                [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float32,
+                cutlass.OpClass.TensorOp, MathOperation.multiply_add
+            ),
+            MathInstruction(
+                [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float16,
+                cutlass.OpClass.TensorOp, MathOperation.multiply_add
+            ),
+            MathInstruction(
+                [16, 8, 16], cutlass.bfloat16, cutlass.bfloat16, cutlass.float32,
+                cutlass.OpClass.TensorOp, MathOperation.multiply_add
+            )
+        ]
+
+        layouts = [
+            (cutlass.RowMajor, cutlass.RowMajor, cutlass.RowMajor),
+            (cutlass.ColumnMajor, cutlass.RowMajor, cutlass.RowMajor),
+            (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajor)
+        ]
+
+        alignments = [
+            (8, 8, 8), (4, 8, 8), (8, 4, 8)
+        ]
+
+        tilings = [
+            ([256, 128, 32], 3, [4, 2, 1]),
+            ([64, 256, 32], 4, [1, 4, 1]),
+            ([128, 64, 64], 3, [2, 2, 1])
+        ]
+
+        for math_inst, layout, alignment, tiling in zip(math_instructions, layouts, alignments, tilings):
+            self.assertTrue(TestGemmOperator(GemmKind.Universal, math_inst, layout, alignment, tiling, 80, False))
+            self.assertTrue(TestGemmOperator(GemmKind.Grouped, math_inst, layout, alignment, tiling, 80, True, precompute_mode=SchedulerMode.Host))
+            stride_supports = [StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided]
+            results = TestConv2dOperator(math_inst, alignment, tiling, 80, stride_supports=stride_supports)
+            for res in results:
+                self.assertTrue(res)
+
+    def test_SM80_TensorOp_1688(self):
+        # tf32 is not supported by most of python environment. Skip the test
+        self.assertTrue(True)
+    
+    def test_SM80_TensorOp_1688_fast_math(self):
+        # Each case bundles one [16, 8, 8] TensorOp instruction with its own
+        # layout triple, alignment triple and tiling. The four cases run one
+        # after another; the settings are paired by position, not combined as
+        # a cross product.
+        tensor_op = cutlass.OpClass.TensorOp
+        fp32 = cutlass.float32
+        cases = [
+            (
+                MathInstruction(
+                    [16, 8, 8], cutlass.tfloat32, cutlass.tfloat32, fp32,
+                    tensor_op, MathOperation.multiply_add
+                ),
+                (cutlass.RowMajor, cutlass.RowMajor, cutlass.ColumnMajor),
+                (4, 4, 4),
+                ([128, 256, 16], 3, [4, 2, 1]),
+            ),
+            (
+                MathInstruction(
+                    [16, 8, 8], cutlass.float16, cutlass.float16, fp32,
+                    tensor_op, MathOperation.multiply_add_fast_f16
+                ),
+                (cutlass.RowMajor, cutlass.ColumnMajor, cutlass.ColumnMajor),
+                (4, 2, 4),
+                ([64, 256, 16], 4, [1, 4, 1]),
+            ),
+            (
+                MathInstruction(
+                    [16, 8, 8], cutlass.bfloat16, cutlass.bfloat16, fp32,
+                    tensor_op, MathOperation.multiply_add_fast_bf16
+                ),
+                (cutlass.ColumnMajor, cutlass.RowMajor, cutlass.ColumnMajor),
+                (2, 4, 4),
+                ([128, 64, 32], 3, [2, 2, 1]),
+            ),
+            (
+                MathInstruction(
+                    [16, 8, 8], fp32, fp32, fp32,
+                    tensor_op, MathOperation.multiply_add_fast_f32
+                ),
+                (cutlass.ColumnMajor, cutlass.ColumnMajor, cutlass.RowMajor),
+                (2, 2, 4),
+                ([256, 64, 32], 3, [4, 2, 1]),
+            ),
+        ]
+        # The same four-element float32 list is handed to every helper call.
+        data_type = [fp32, fp32, fp32, fp32]
+
+        for instruction, layout_triple, alignment_triple, tile in cases:
+            universal_ok = TestGemmOperator(
+                GemmKind.Universal, instruction, layout_triple,
+                alignment_triple, tile, 80, False, data_type=data_type)
+            self.assertTrue(universal_ok)
+
+            grouped_ok = TestGemmOperator(
+                GemmKind.Grouped, instruction, layout_triple,
+                alignment_triple, tile, 80, True,
+                precompute_mode=SchedulerMode.Device, data_type=data_type)
+            self.assertTrue(grouped_ok)
+
+            # A fresh stride-support list is built for every case.
+            stride_supports = [
+                StrideSupport.Unity, StrideSupport.Strided, StrideSupport.Unity
+            ]
+            conv_outcomes = TestConv2dOperator(
+                instruction, alignment_triple, tile, 80,
+                stride_supports=stride_supports, data_type=data_type)
+            for outcome in conv_outcomes:
+                self.assertTrue(outcome)
+
+    def test_SM80_TensorOp_884(self):
+        # float64 TensorOp case with instruction shape [8, 8, 4]: runs the
+        # universal GEMM, grouped GEMM and conv2d helpers in that order.
+        fp64 = cutlass.float64
+        instruction = MathInstruction(
+            [8, 8, 4], fp64, fp64, fp64,
+            cutlass.OpClass.TensorOp, MathOperation.multiply_add
+        )
+        col_major = cutlass.ColumnMajor
+        layout_triple = (col_major, col_major, col_major)
+        alignment_triple = (1, 1, 1)
+        tile = ([64, 256, 16], 3, [2, 4, 1])
+        dtypes = [fp64, fp64, fp64, fp64]
+
+        universal_ok = TestGemmOperator(
+            GemmKind.Universal, instruction, layout_triple, alignment_triple,
+            tile, 80, False, data_type=dtypes)
+        self.assertTrue(universal_ok)
+
+        grouped_ok = TestGemmOperator(
+            GemmKind.Grouped, instruction, layout_triple, alignment_triple,
+            tile, 80, True,
+            precompute_mode=SchedulerMode.Device, data_type=dtypes)
+        self.assertTrue(grouped_ok)
+
+        stride_supports = [
+            StrideSupport.Unity, StrideSupport.Strided, StrideSupport.Unity
+        ]
+        conv_outcomes = TestConv2dOperator(
+            instruction, alignment_triple, tile, 80,
+            stride_supports=stride_supports, data_type=dtypes)
+        for outcome in conv_outcomes:
+            self.assertTrue(outcome)
+    
+    def test_SM80_TensorOp_16832_TN(self):
+        # int8 TensorOp case with instruction shape [16, 8, 32] and saturating
+        # multiply-add. The universal GEMM and conv2d runs use the plain
+        # alignment/data-type settings; the grouped GEMM run uses the
+        # "mixed" ones.
+        int8 = cutlass.int8
+        int32 = cutlass.int32
+        instruction = MathInstruction(
+            [16, 8, 32], int8, int8, int32,
+            cutlass.OpClass.TensorOp, MathOperation.multiply_add_saturate
+        )
+        layout_triple = (
+            cutlass.RowMajor, cutlass.ColumnMajor, cutlass.ColumnMajor
+        )
+        plain_alignment = (16, 16, 4)
+        mixed_alignment = (16, 16, 16)
+        tile = ([128, 256, 64], 3, [2, 4, 1])
+
+        plain_dtypes = [int8, int8, int32, int32]
+        mixed_dtypes = [int8, int8, int8, cutlass.float32]
+
+        universal_ok = TestGemmOperator(
+            GemmKind.Universal, instruction, layout_triple, plain_alignment,
+            tile, 80, False, data_type=plain_dtypes)
+        self.assertTrue(universal_ok)
+
+        grouped_ok = TestGemmOperator(
+            GemmKind.Grouped, instruction, layout_triple, mixed_alignment,
+            tile, 80, True,
+            precompute_mode=SchedulerMode.Device, data_type=mixed_dtypes)
+        self.assertTrue(grouped_ok)
+
+        stride_supports = [
+            StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided
+        ]
+        conv_outcomes = TestConv2dOperator(
+            instruction, plain_alignment, tile, 80,
+            stride_supports=stride_supports, data_type=plain_dtypes)
+        for outcome in conv_outcomes:
+            self.assertTrue(outcome)
+    
+    def test_SM80_Simt_f32(self):
+        # float32 Simt case with instruction shape [1, 1, 1]. The grouped GEMM
+        # run here uses SchedulerMode.Host as its precompute mode.
+        fp32 = cutlass.float32
+        instruction = MathInstruction(
+            [1, 1, 1], fp32, fp32, fp32,
+            cutlass.OpClass.Simt, MathOperation.multiply_add
+        )
+        row_major = cutlass.RowMajor
+        layout_triple = (row_major, row_major, row_major)
+        alignment_triple = (1, 1, 1)
+        tile = ([128, 256, 8], 4, [2, 4, 1])
+        dtypes = [fp32, fp32, fp32, fp32]
+
+        universal_ok = TestGemmOperator(
+            GemmKind.Universal, instruction, layout_triple, alignment_triple,
+            tile, 80, False, data_type=dtypes)
+        self.assertTrue(universal_ok)
+
+        grouped_ok = TestGemmOperator(
+            GemmKind.Grouped, instruction, layout_triple, alignment_triple,
+            tile, 80, True,
+            precompute_mode=SchedulerMode.Host, data_type=dtypes)
+        self.assertTrue(grouped_ok)
+
+        stride_supports = [
+            StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided
+        ]
+        conv_outcomes = TestConv2dOperator(
+            instruction, alignment_triple, tile, 80,
+            stride_supports=stride_supports, data_type=dtypes)
+        for outcome in conv_outcomes:
+            self.assertTrue(outcome)
+
+    def test_SM80_Simt_f64(self):
+        # float64 Simt case with instruction shape [1, 1, 1]: runs the
+        # universal GEMM, grouped GEMM and conv2d helpers in that order.
+        fp64 = cutlass.float64
+        instruction = MathInstruction(
+            [1, 1, 1], fp64, fp64, fp64,
+            cutlass.OpClass.Simt, MathOperation.multiply_add
+        )
+        layout_triple = (
+            cutlass.RowMajor, cutlass.RowMajor, cutlass.ColumnMajor
+        )
+        alignment_triple = (1, 1, 1)
+        tile = ([64, 128, 8], 5, [2, 2, 1])
+        dtypes = [fp64, fp64, fp64, fp64]
+
+        universal_ok = TestGemmOperator(
+            GemmKind.Universal, instruction, layout_triple, alignment_triple,
+            tile, 80, False, data_type=dtypes)
+        self.assertTrue(universal_ok)
+
+        grouped_ok = TestGemmOperator(
+            GemmKind.Grouped, instruction, layout_triple, alignment_triple,
+            tile, 80, True,
+            precompute_mode=SchedulerMode.Device, data_type=dtypes)
+        self.assertTrue(grouped_ok)
+
+        stride_supports = [
+            StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided
+        ]
+        conv_outcomes = TestConv2dOperator(
+            instruction, alignment_triple, tile, 80,
+            stride_supports=stride_supports, data_type=dtypes)
+        for outcome in conv_outcomes:
+            self.assertTrue(outcome)
+
+    def test_SM80_TensorOp_16832_Interleaved(self):
+        # int8 TensorOp case with instruction shape [16, 8, 32] on interleaved
+        # layouts. Only the universal GEMM and conv2d helpers are run here;
+        # there is no grouped GEMM run.
+        int8 = cutlass.int8
+        instruction = MathInstruction(
+            [16, 8, 32], int8, int8, cutlass.int32,
+            cutlass.OpClass.TensorOp, MathOperation.multiply_add_saturate
+        )
+
+        gemm_layout = (
+            cutlass.ColumnMajorInterleaved32,
+            cutlass.RowMajorInterleaved32,
+            cutlass.ColumnMajorInterleaved32,
+        )
+        mixed_alignment = (16, 16, 8)
+        tile = ([256, 64, 64], 4, [4, 1, 1])
+        mixed_dtypes = [int8, int8, int8, cutlass.float32]
+
+        universal_ok = TestGemmOperator(
+            GemmKind.Universal, instruction, gemm_layout, mixed_alignment,
+            tile, 80, False, data_type=mixed_dtypes,
+            epilogue_functor=EpilogueFunctor.FastLinearCombinationClamp)
+        self.assertTrue(universal_ok)
+
+        stride_supports = [
+            StrideSupport.Strided, StrideSupport.Strided, StrideSupport.Strided
+        ]
+        # The conv2d run takes its own layout list of tensor layouts.
+        conv_layout = [
+            cutlass.TensorNC32HW32, cutlass.TensorC32RSK32, cutlass.TensorNC32HW32
+        ]
+        conv_outcomes = TestConv2dOperator(
+            instruction, mixed_alignment, tile, 80,
+            stride_supports=stride_supports, data_type=mixed_dtypes,
+            layout=conv_layout, interleaved=True)
+        for outcome in conv_outcomes:
+            self.assertTrue(outcome)
+
+    # NOTE(review): unlike the neighbouring methods, this name has no ``test_``
+    # prefix, so unittest's default loader will not collect it -- confirm
+    # whether that is intentional. The body is an empty placeholder either way.
+    def SM80_SparseTensorOp_16832(self):
+        pass
+    # Placeholders: every method from here through test_SM80_Simt_complex has
+    # an empty body (``pass``), so each one is collected by unittest and passes
+    # without exercising anything.
+    def test_SM80_PlanarComplexTensorOp_16816(self):
+        pass
+    def test_SM80_SparseTensorOp_16816_fast_math(self):
+        pass
+    def test_SM80_TensorOp_1688_complex(self):
+        pass
+    def test_SM80_TensorOp_1688_fast_fp32_math_complex(self):
+        pass
+    def test_SM80_TensorOp_1688_rank_k(self):
+        pass
+    def test_SM80_TensorOp_1688_rank_k_complex(self):
+        pass
+    def test_SM80_TensorOp_1688_trmm(self):
+        pass
+    def test_SM80_TensorOp_1688_trmm_complex(self):
+        pass
+    def test_SM80_TensorOp_1688_symm(self):
+        pass
+    def test_SM80_TensorOp_1688_symm_complex(self):
+        pass
+    def test_SM80_TensorOp_884_complex(self):
+        pass
+    def test_SM80_TensorOp_884_complex_gaussian(self):
+        pass
+    def test_SM80_TensorOp_884_rank_k(self):
+        pass
+    def test_SM80_TensorOp_884_rank_k_complex(self):
+        pass
+    def test_SM80_TensorOp_884_rank_k_complex_gaussian(self):
+        pass
+    def test_SM80_TensorOp_884_trmm(self):
+        pass
+    def test_SM80_TensorOp_884_trmm_complex(self):
+        pass
+    def test_SM80_TensorOp_884_trmm_complex_gaussian(self):
+        pass
+    def test_SM80_TensorOp_884_symm(self):
+        pass
+    def test_SM80_TensorOp_884_symm_complex(self):
+        pass
+    def test_SM80_TensorOp_884_symm_complex_gaussian(self):
+        pass
+    def test_SM80_SparseTensorOp_16864_TN(self):
+        pass
+    def test_SM80_TensorOp_16864_TN(self):
+        pass
+    def test_SM80_SparseTensorOp_168128_TN(self):
+        pass
+    def test_SM80_TensorOp_16864_Interleaved(self):
+        pass
+    def test_SM80_TensorOp_168256(self):
+        pass
+    def test_SM80_Simt_complex(self):
+        pass
+
+
+if __name__ == '__main__':
+    # Setup performed before handing control to the unittest runner.
+    # The two arguments are presumably the initial and maximum pool sizes in
+    # bytes (2**20 and 2**34) -- TODO confirm against pycutlass.get_memory_pool.
+    pycutlass.get_memory_pool(2**20, 2**34)
+    # Presumably selects nvcc as the kernel compiler backend -- TODO confirm.
+    pycutlass.compiler.nvcc()
+    unittest.main()
				`@ -0,0 +1 @@`
				`from pycutlass.utils.reference_model import *`