CUTLASS 3.2.1 (#1113)

* Updates for 3.2.1 release.

* Minor fix in gemm op profiler for raster order.

* Add scheduler mapping for raster order in the kernels.
This commit is contained in:
ANIKET SHIVAM
2023-09-26 14:24:26 -07:00
committed by GitHub
parent e0aaa3c3b3
commit 90d3b0fb18
428 changed files with 22253 additions and 21762 deletions

View File

@ -0,0 +1,660 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utilities for defining Conv2D problem sizes for testing.
This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
"""
import cutlass
from cutlass import ConvMode
from cutlass.shape import Conv2DProblemSize
class TestbedConv2dProblemSizes:
    """Collection of Conv2D problem sizes used by the unit tests.

    Ported from the C++ version in test/unit/conv/device/conv2d_problems.h.
    The filtered union of all candidate lists is exposed as ``self.all``.
    """

    def __init__(self, minimum_channel_size: int):
        """Build every candidate list, then keep only the problems whose
        per-group channel count is a multiple of ``minimum_channel_size``."""
        candidate_lists = [
            self.initialize_conv2d_default_sizes(minimum_channel_size),
            self.initialize_conv2d_rigorous_sizes(minimum_channel_size),
            self.initialize_conv2d_resnet50_sizes(1),
            self.initialize_conv2d_resnet50_sizes(34),
            self.initialize_conv2d_grouped_sizes(),
        ]
        # Filter all problems: channels-per-group must divide evenly by the
        # minimum channel size so the kernel's alignment requirements hold.
        self.all = [
            ps
            for candidates in candidate_lists
            for ps in candidates
            if (ps.C // ps.groups) % minimum_channel_size == 0
        ]
def initialize_conv2d_default_sizes(self, minimum_channel_size):
# Small input size x stride (1,1)
# C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
conv2d_default_sizes = []
conv2d_default_sizes.append(Conv2DProblemSize(
1, 1, 1, minimum_channel_size,
8, 1, 1, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 1, 8, minimum_channel_size,
8, 1, 3, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 7, 8, minimum_channel_size,
8, 3, 3, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 7, 9, minimum_channel_size,
8, 4, 4, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
2, 7, 9, minimum_channel_size,
8, 5, 5, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
3, 7, 9, minimum_channel_size,
8, 6, 5, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
3, 7, 9, minimum_channel_size,
8, 6, 6, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
3, 7, 9, minimum_channel_size,
8, 7, 7, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
##############################################
# Small input size x stride (2,2)
# C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
##############################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 11, 7, minimum_channel_size,
8, 1, 1, minimum_channel_size,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 11, 7, minimum_channel_size,
8, 3, 3, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 13, 11, minimum_channel_size,
8, 1, 1, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 17, 19, minimum_channel_size,
16, 2, 2, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 23, 5, minimum_channel_size,
16, 3, 3, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 13, 17, 8,
24, 3, 3, 8,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 23, 21, 8,
24, 3, 3, 8,
1, 1,
3, 3,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 20, 24, 8,
40, 3, 3, 8,
3, 3,
3, 3,
1, 1,
))
##########################################
# Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1)
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 15, 19, 160,
224, 1, 1, 160,
0, 0,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 19, 37, 160,
224, 3, 3, 160,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 16, 16, 160,
224, 2, 3, 160,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 23, 21, 128,
224, 3, 3, 128,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 29, 37, 160,
224, 5, 5, 160,
2, 2,
1, 1,
1, 1,
))
##########################################
# C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 15, 19, 32 + minimum_channel_size,
96, 3, 3, 32 + minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 16, 24, 64 + minimum_channel_size,
96, 3, 3, 64 + minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
##########################################
# Medium input size, filter size (1x1, 3,x3, 5x5, 7x7), stride (2, 2)
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 13, 16, 288,
160, 5, 5, 288,
2, 2,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 55, 51, 256,
512, 1, 1, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 71, 80, 32,
64, 5, 5, 32,
2, 2,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 224, 224, 8,
64, 7, 7, 8,
3, 3,
2, 2,
1, 1,
))
##########################################
# Medium input size stride (3, 3), filter (3, 3), non-default padding
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 23, 256,
512, 3, 3, 256,
0, 0,
3, 3,
1, 1,
))
##########################################
# Medium input size padding > stride, asymmetric filter, padding and striding
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 31, 256,
512, 3, 3, 256,
5, 7,
3, 4,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 35, 256,
512, 7, 5, 256,
11, 7,
3, 5,
1, 1,
))
##########################################
# Medium input size *mixed* stride (1, 2) and (2, 1),
# filter (3, 3), default padding
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 27, 256,
512, 3, 3, 256,
1, 1,
1, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 27, 256,
512, 3, 3, 256,
1, 1,
2, 1,
1, 1,
))
######################################/
# Additional input size
######################################/
conv2d_default_sizes.append(Conv2DProblemSize(
3, 28, 28, 256,
256, 2, 2, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 32, 32, 16,
32, 3, 3, 16,
1, 1,
6, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
32, 24, 32, 32,
32, 1, 2, 32,
0, 0,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
4, 2, 3, 256,
328, 3, 5, 256,
1, 1,
1, 1,
1, 1,
))
return conv2d_default_sizes
    # Add a few large and rigorous convolution problem sizes
    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size):
        """Return large/rigorous Conv2D problem sizes.

        NOTE(review): the body is disabled by the ``if False:`` guard below
        (presumably mirroring a ``#if 0`` in the C++ conv2d_problems.h this
        file was ported from), so this currently always returns an empty
        list. Flip the guard to re-enable the two large test cases.
        """
        sizes = []
        if False:
            # N, H, W, C of the activation tensor, then K, R, S, C of the filter.
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 124, 224, 2 * minimum_channel_size),
                (24, 7, 7, 2 * minimum_channel_size),
            ))
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 233, 35, minimum_channel_size),
                (24, 7, 5, minimum_channel_size),
            ))
        return sizes
# Add resent50 layers to unit testing sizes
def initialize_conv2d_resnet50_sizes(self, batch_size):
conv2d_problem_vector = []
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 64,
256, 1, 1, 64,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 64,
64, 1, 1, 64,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 64,
64, 3, 3, 64,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 256,
64, 1, 1, 256,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 256,
512, 1, 1, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 256,
128, 1, 1, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 128,
128, 3, 3, 128,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 128,
512, 1, 1, 128,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 512,
128, 1, 1, 512,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 512,
1024, 1, 1, 512,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 512,
256, 1, 1, 512,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 256,
256, 3, 3, 256,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 256,
1024, 1, 1, 256,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 1024,
256, 1, 1, 1024,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 1024,
2048, 1, 1, 1024,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 1024,
512, 1, 1, 1024,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 7, 7, 512,
512, 3, 3, 512,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 7, 7, 512,
2048, 1, 1, 512,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 7, 7, 2048,
512, 1, 1, 2048,
0, 0,
1, 1,
1, 1,
))
return conv2d_problem_vector
def initialize_conv2d_grouped_sizes(self):
threadblock_n = 128
threadblock_k = 32
sizes = []
##########################################
# One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
# One CTA calculates a single group
##########################################
for cta_per_group_k in range(1, 4):
for groups in range(2, 5):
conv_k = cta_per_group_k * threadblock_n * groups
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 2 * groups,
conv_k, 3, 3, threadblock_k * 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
groups
))
# Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k,
threadblock_n * 2, 3, 3, threadblock_k // 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
2
))
sizes.append(Conv2DProblemSize(
1, 56, 56, 696,
768, 3, 3, 232,
1, 1,
2, 2,
1, 1,
ConvMode.CrossCorrelation,
1,
3
))
sizes.append(Conv2DProblemSize(
1, 14, 14, 1392,
1536, 3, 3, 232,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
3
))
##########################################
# One CTA calculate multiple groups: CTA::N % k_per_group = 0
##########################################
# 2 groups per CTA
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 4,
threadblock_n, 3, 3, threadblock_k * 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
2
))
# 2 groups per CTA and partial gemm_k
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k,
threadblock_n, 3, 3, threadblock_k // 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
2
))
# 4 groups per CTA
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 8,
threadblock_n // 2, 3, 3, threadblock_k * 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
4
))
# 4 groups per CTA and partial gemm_k
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 2,
threadblock_n // 2, 3, 3, threadblock_k // 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
4
))
return sizes

View File

@ -0,0 +1,146 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for Conv2d operations on SM80
"""
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from conv2d_test_utils import *
# Keep kernel-emission logs quiet during test collection.
cutlass.set_log_level(logging.WARNING)
# All tests in this file target SM80 (compute capability 8.0).
cc = 80

@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
class Conv2dSm80(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__
    """
    pass
# Shared problem set for all dynamically-registered tests below.
conv_problems = get_conv_problems()

# Tests for optimized & analytic
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    # F16, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
    # F16, tensor op
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, analytic iterator (note: f16 accumulator here)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
    # F16, tensor op, f32 output
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, different tile description
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
    # F32, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
    # Tf32, tensorop
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 16],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
    )
    # Split-K
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
        split_k_slices=2)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
        split_k_slices=5)
    # Swizzling functor
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)

# Tests for few channels and fixed channels (fprop only)
# F16, tensor op, few channels
for c, tb, stage, inst in zip([2, 1],
                              [[128, 128, 64], [128, 128, 32]],
                              [3, 2],
                              [[16, 8, 16], [16, 8, 8]]):
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=tb,
        warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
    )
# F16, tensor op, fixed channels
for c in [8, 4, 2]:
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
    )

# Test activations, combined with serial/parallel split-K
for activation in ["relu", "leaky_relu"]:
    for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
        add_test(
            Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
            opclass="tensor_op", threadblock_shape=[128, 128, 64],
            warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
            split_k_slices=split_k_slices, activation=activation)

if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,425 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for Conv2d tests.
"""
import torch
import cutlass
from cutlass import (
ConvKind,
ConvMode,
DataType,
DataTypeNames,
EpilogueScheduleSuffixes,
KernelScheduleSuffixes,
LayoutType,
OpcodeClassNames,
ShortDataTypeNames,
ShortLayoutTypeNames,
SplitKMode,
)
from cutlass.backend.utils.software import SubstituteTemplate
from cutlass.shape import Conv2DProblemSize
from cutlass.utils.datatypes import numpy_type, torch_type
from conv2d_problem_sizes import TestbedConv2dProblemSizes
def get_name_conv2d(
    arch,
    conv_kind,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm,
    swizzle,
    split_k_mode,
    split_k_slices,
    activation
):
    """
    Generates a procedural name for a test case for conv2d

    :param arch: compute capability of kernel being generated
    :type arch: int
    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
    :type conv_kind: str
    :param element: data type of operands A and B
    :param element_accumulator: data type used in accumulation
    :param element_output: data type of operands C and D
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param warp_count: indexable container of warp counts per threadblock dimension
    :param instruction_shape: indexable container of dimensions of the MMA instruction
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param iterator_algorithm: the iterator algorithm applied, or None for "AUTO"
    :param swizzle: swizzling stride, or None for 1
    :param split_k_mode: "serial" or "parallel" split-K mode
    :param split_k_slices: number of split-K slices
    :type split_k_slices: int
    :param activation: name of the epilogue activation function
    :return: the procedurally generated test name
    :rtype: str
    """
    # Normalize the optional knobs so the name is always well-formed.
    if iterator_algorithm is None:
        iterator_algorithm = "AUTO"
    if swizzle is None:
        swizzle = 1
    name_format = "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"
    return SubstituteTemplate(
        name_format,
        {
            "arch": str(arch),
            "conv_kind": conv_kind,
            "iter_alg": iterator_algorithm,
            # A and B share one element type in this test harness.
            "eA": DataTypeNames[element],
            "eB": DataTypeNames[element],
            "eC": DataTypeNames[element_output],
            "opclass": opclass,
            "acc": DataTypeNames[element_accumulator],
            "tbM": str(threadblock_shape[0]),
            "tbN": str(threadblock_shape[1]),
            "tbK": str(threadblock_shape[2]),
            # Warp tile = threadblock tile divided by warp count per dimension.
            "wM": str(threadblock_shape[0] // warp_count[0]),
            "wN": str(threadblock_shape[1] // warp_count[1]),
            "wK": str(threadblock_shape[2] // warp_count[2]),
            "IM": str(instruction_shape[0]),
            "IN": str(instruction_shape[1]),
            "IK": str(instruction_shape[2]),
            "stages": str(stages),
            "swizzle": str(swizzle),
            "split_k_mode": split_k_mode,
            "split_k_slices": str(split_k_slices),
            "activation": activation
        }
    )
def conv2d_few_channel_problemsizes(channels):
    """Return Conv2D problem sizes whose channel count equals ``channels``.

    Used by the few-channels / fixed-channels iterator tests. Each spec is
    (N, H, W, K, R, S, pad, stride); C is always ``channels``, padding and
    stride are symmetric, and dilation is 1.
    """
    specs = [
        (1, 8, 8, 16, 3, 3, 1, 2),
        (1, 16, 16, 16, 3, 3, 1, 2),
        (1, 16, 16, 16, 7, 7, 1, 1),
        (1, 224, 224, 32, 7, 7, 1, 1),
        (1, 224, 224, 64, 7, 7, 1, 2),
        (1, 224, 224, 64, 5, 5, 1, 1),
        (1, 224, 224, 64, 5, 5, 1, 2),
    ]
    return [
        Conv2DProblemSize(
            n, h, w, channels,
            k, r, s, channels,
            pad, pad,
            stride, stride,
            1, 1,
            ConvMode.CrossCorrelation,
            1, 1
        )
        for n, h, w, k, r, s, pad, stride in specs
    ]
def validate_problem_size(ps, conv_kind, split_k_slices):
    """Return True if ``ps`` is self-consistent and runnable for ``conv_kind``.

    :param ps: problem size with H/W/P/Q, pad, stride, dilation, R/S attributes
    :param conv_kind: "fprop", "dgrad", or "wgrad"
    :param split_k_slices: number of split-K slices requested
    :return: whether the problem should be run
    :rtype: bool
    """
    # The recorded output extent must match the standard convolution formula.
    expected_p = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
    expected_q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
    if (expected_p, expected_q) != (ps.P, ps.Q):
        return False
    # Split-K (serial or parallel) is not supported for strided dgrad.
    is_strided = ps.stride_h > 1 or ps.stride_w > 1
    if conv_kind == "dgrad" and split_k_slices > 1 and is_strided:
        return False
    return True
class Conv2dLauncherFrontend:
    """
    Runs a ``cutlass.Conv2d`` plan on random torch tensors and verifies the
    result against a PyTorch reference implementation.
    """

    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
        # Plan under test and its convolution kind (fprop/dgrad/wgrad).
        self.operation = plan
        self.conv_kind = plan.conv_kind
        # Seed for torch.manual_seed, applied before each run's tensor init.
        self.seed = seed
        # NOTE(review): ``backend`` is stored but tensors below are always
        # created with torch — confirm whether a numpy path is still intended.
        self.backend = backend
        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator
        # All operands use the NHWC layout.
        self.layout_A = LayoutType.TensorNHWC
        self.layout_B = LayoutType.TensorNHWC
        self.layout_C = LayoutType.TensorNHWC
        self.layout_D = LayoutType.TensorNHWC
        self.element_compute = DataType.f32
        # Narrow dtypes use a smaller value range so the exact-equality
        # comparison in run() is not defeated by rounding.
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            self.rand_max = 1
        else:
            self.rand_max = 4
        self.activation = plan.activation

    def uniform_init(self, size, dtype):
        """Return a channels-last tensor of integer values in [-rand_max, rand_max]."""
        # uniform_ then ceil yields whole numbers, so fp results compare exactly.
        tensor = torch.ceil(
            torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
        ).to(memory_format=torch.channels_last)
        return tensor

    def reference(self, ps, A, B, C, alpha, beta, activation):
        """Compute alpha * conv(A, B) + beta * C with torch, then apply activation."""
        if self.conv_kind == ConvKind.Fprop:
            torch_result = alpha * torch.ops.aten.conv2d(
                A,
                B,
                stride=(ps.stride_h, ps.stride_w),
                padding=(ps.pad_h, ps.pad_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Dgrad:
            # Gradient w.r.t. the input: A holds the output gradient, B the filter.
            torch_result = alpha * torch.nn.grad.conv2d_input(
                (ps.N, ps.C, ps.H, ps.W),
                B,
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Wgrad:
            # Gradient w.r.t. the filter: A holds the output gradient, B the input.
            torch_result = alpha * torch.nn.grad.conv2d_weight(
                B,
                (ps.K, ps.C, ps.R, ps.S),
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            ) + beta * C
        else:
            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

        if activation == cutlass.backend.epilogue.relu:
            torch_result = torch.nn.functional.relu(torch_result)
        elif activation == cutlass.backend.epilogue.leaky_relu:
            # Slope 0.5 matches the value set on the plan in add_test().
            torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
        return torch_result

    def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        """Run the plan on problem size ``ps`` and return True if it matches the reference exactly."""
        # Operand shapes depend on the convolution kind.
        if self.conv_kind == ConvKind.Fprop:
            tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
        elif self.conv_kind == ConvKind.Dgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
        elif self.conv_kind == ConvKind.Wgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        # Reseed so each run is reproducible regardless of ordering.
        torch.manual_seed(self.seed)

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)

        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
                           stride=(ps.stride_h, ps.stride_w),
                           padding=(ps.pad_h, ps.pad_w),
                           dilation=(ps.dilation_h, ps.dilation_w),
                           alpha=alpha, beta=beta,
                           split_k=(split_k_mode, split_k_slices))

        tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)
        torch.cuda.synchronize()

        # Integer-valued inputs make exact equality a valid pass criterion.
        passed = torch.equal(tensor_D, tensor_D_ref)
        return passed
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation = "identity"
):
    """Create a test-running function with the given specification and attach
    it to ``cls`` under a procedurally generated name.

    :param cls: unittest.TestCase subclass to attach the generated test to
    :param cc: compute capability being targeted
    :param conv_kind: "fprop", "dgrad", or "wgrad"
    :param problem_sizes: iterable of Conv2DProblemSize to run
    :param element: data type of operands A and B
    :param element_accumulator: data type used in accumulation
    :param element_output: data type of operands C and D
    :param opclass: "simt" or "tensor_op"
    :param threadblock_shape: threadblock tile dimensions
    :param warp_count: warps per threadblock dimension
    :param instruction_shape: MMA instruction dimensions
    :param stages: number of pipeline stages
    :param iterator_algorithm: optional iterator algorithm name
    :param swizzle: optional swizzling stride
    :param split_k_mode: "serial" or "parallel"
    :param split_k_slices: number of split-K slices
    :param activation: epilogue activation name
    :return: the generated test function (also set as an attribute of ``cls``)
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Create the plan
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )
        # Set the opclass
        plan.opclass = opclass
        # Set the tile description
        td = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }
        plan.tile_description = td
        # Set iterator algorithm
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        # Set swizzling functor
        if swizzle is not None:
            plan.swizzling_stride = swizzle
        if activation != "identity":
            # leaky_relu takes its negative slope as an extra argument.
            if activation == "leaky_relu":
                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
            else:
                plan.activation = getattr(cutlass.epilogue, activation)
        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")
        # Run every problem size this configuration supports; beta=2.0
        # exercises the source-accumulate path.
        for ps in problem_sizes:
            if not validate_problem_size(ps, conv_kind, split_k_slices): continue
            self.assertTrue(conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))
    setattr(cls, test_name, run)
    return run
def get_conv_problems():
    """Return the default test problem set: the standard testbed (minimum
    channel size 64) plus three alignment-4/2 cases."""
    problems = TestbedConv2dProblemSizes(64).all
    # Insert alignment 4 & 2 tests: (N, H, W, C, K, R, S, C, pad_h, pad_w,
    # stride_h, stride_w, dilation_h, dilation_w)
    extra_specs = [
        (1, 4, 4, 12, 8, 3, 3, 12, 0, 0, 3, 3, 1, 1),
        (1, 4, 4, 14, 8, 3, 3, 14, 0, 0, 3, 3, 1, 1),
        (1, 23, 56, 98, 128, 3, 3, 98, 4, 5, 3, 3, 1, 1),
    ]
    for spec in extra_specs:
        problems.append(Conv2DProblemSize(
            *spec, ConvMode.CrossCorrelation, 1, 1))
    return problems

View File

@ -0,0 +1,44 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
    # Discover and run every conv2d_*.py unittest file that lives next to
    # this script, independent of the current working directory.
    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    tests = unittest.TestLoader().discover(script_dir, 'conv2d_*.py')
    results = unittest.runner.TextTestRunner().run(tests)
    if not results.wasSuccessful():
        # SystemExit gives a nonzero exit status without a Python traceback,
        # which is the conventional failure signal for a test-runner script.
        raise SystemExit('Test cases failed')

View File

@ -0,0 +1,308 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests emitting a CUTLASS kernel to a PyTorch CUDA extension
"""
import random
import tempfile
import unittest
import cutlass
if cutlass.utils.datatypes.torch_available:
import torch
def _initialize(dtype, M: int, N: int, K: int):
"""
Utility function to initialize A, B, C, and D matrices corresponding to dimensions M, N, and K
:param dtype: data type of tensors
:param M: M dimension of GEMM problem
:type M: int
:param N: N dimension of GEMM problem
:type N: int
:param K: N dimension of GEMM problem
:type K: int
:return: initialized tensors A, B, C, and D
:rtype: list
"""
sizes = [(M, K), (K, N), (M, N), (M, N)]
return [torch.randint(-3, 3, size, device='cuda').to(dtype) for size in sizes]
def _generate_problems(dtype, num):
    """
    Utility function to generate `num` GEMMs of random sizes

    :param dtype: data type of tensors
    :param num: number of GEMMs to generate
    :type num: int

    :return: lists of A, B, C, and D tensors
    :rtype: list
    """
    valid_sizes = [128, 256, 512, 1024]
    problems = []
    for _ in range(num):
        # Pick a random (M, N, K) for this group member and build its tensors
        m, n, k = (random.choice(valid_sizes) for _ in range(3))
        problems.append(_initialize(dtype, m, n, k))
    if not problems:
        return [], [], [], []
    # Transpose [(A, B, C, D), ...] into (As, Bs, Cs, Ds)
    As, Bs, Cs, Ds = (list(tensors) for tensors in zip(*problems))
    return As, Bs, Cs, Ds
def _generate_conv2d_problem(conv_kind, dtype, ps):
"""
Utility function to generate conv2d inputs
:param conv_kind: kind of convolution
:type conv_kind: str
:param dtype: data type of tensors
:param problem_size: the conv2d problem size
:type problem_size: cutlass.shape.Conv2DProblemSize
:return: initialized tensors A, B, C, and D
:rtype: list
"""
if conv_kind == "fprop":
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
elif conv_kind == "dgrad":
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
else:
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]
@unittest.skipIf(not cutlass.utils.datatypes.torch_available, 'PyTorch must be available to run PyTorch extension tests')
class PyTorchExtensionTest(unittest.TestCase):
    """
    End-to-end tests that emit CUTLASS GEMM, grouped GEMM, and conv2d
    operations as JIT-compiled PyTorch CUDA extensions and verify the
    extension results against PyTorch reference implementations.
    """

    def test_gemm(self):
        """
        GEMM with a ReLU epilogue, checked against torch matmul under the
        default and explicit alpha/beta call forms.
        """
        random.seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
        plan.activation = cutlass.epilogue.relu
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name='gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)
            A, B, C, _ = _initialize(dtype, 1024, 256, 512)
            # All four call forms below are expected to equal relu(A @ B);
            # the asserts imply beta defaults to 0, so C does not contribute.
            D_ref = torch.nn.functional.relu(A @ B)
            D = mod.run(A, B)
            assert torch.allclose(D, D_ref)
            D = mod.run(A, B, C)
            assert torch.allclose(D, D_ref)
            D = mod.run(A, B, C, 1.0)
            assert torch.allclose(D, D_ref)
            D = mod.run(A, B, C, 1.0, 0.0)
            assert torch.allclose(D, D_ref)
            # Nontrivial scaling: D = relu(alpha * (A @ B) + beta * C)
            alpha = 2.0
            beta = -1.0
            D_ref = torch.nn.functional.relu((A @ B) * alpha + (beta * C))
            D = mod.run(A, B, C, alpha, beta)
            assert torch.allclose(D, D_ref)

    def test_grouped_gemm(self):
        """
        Grouped GEMM over 50 random problem sizes, checked group-by-group
        against torch matmul.
        """
        random.seed(2023)
        dtype = torch.float16
        plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name='grouped_gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)
            As, Bs, Cs, _ = _generate_problems(dtype, 50)

            def check_all(X, Y):
                # Elementwise comparison of two lists of tensors
                for x, y in zip(X, Y):
                    assert torch.allclose(x, y)

            Ds_ref = [a @ b for a, b in zip(As, Bs)]
            Ds = mod.run(As, Bs)
            check_all(Ds, Ds_ref)
            Ds = mod.run(As, Bs, Cs)
            check_all(Ds, Ds_ref)
            Ds = mod.run(As, Bs, Cs, 1.0)
            check_all(Ds, Ds_ref)
            Ds = mod.run(As, Bs, Cs, 1.0, 0.0)
            check_all(Ds, Ds_ref)
            # Nontrivial scaling per group
            alpha = 2.0
            beta = -1.0
            Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
            Ds = mod.run(As, Bs, Cs, alpha, beta)
            check_all(Ds, Ds_ref)

    def test_conv2d_fprop(self):
        """
        Conv2d forward propagation with a ReLU epilogue, including serial
        and parallel split-K variants checked against the non-split result.
        """
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
        plan.activation = "relu"
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
            problem_size = cutlass.shape.Conv2DProblemSize(
                1, 4, 4, 16,
                8, 3, 3, 16,
                0, 0,
                3, 3,
                1, 1
            )
            A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)
            alpha = 1.0
            beta = 0.5
            # Reference: relu(alpha * conv2d(A, B) + beta * C)
            D_ref = alpha * torch.ops.aten.conv2d(
                A, B, stride=stride, padding=padding
            ) + beta * C
            D_ref = torch.nn.functional.relu(D_ref)
            D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)
            assert torch.allclose(D, D_ref)
            # Test serial split-K
            D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
            assert torch.allclose(D, D_serial_split_k)
            # Test parallel split-K
            D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
            assert torch.allclose(D, D_parallel_split_k)

    def test_conv2d_dgrad(self):
        """
        Conv2d data gradient (dgrad), checked against
        torch.nn.grad.conv2d_input.
        """
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
            problem_size = cutlass.shape.Conv2DProblemSize(
                1, 4, 4, 16,
                8, 3, 3, 16,
                0, 0,
                3, 3,
                1, 1,
                cutlass.ConvMode.CrossCorrelation,
                1, 1
            )
            A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)
            alpha = 1.0
            beta = 0.5
            # dgrad requires the activation-gradient output shape explicitly
            input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
            D_ref = alpha * torch.nn.grad.conv2d_input(
                input_size, B, A,
                stride=stride, padding=padding
            ) + beta * C
            D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta, )
            assert torch.allclose(D, D_ref)

    def test_conv2d_wgrad(self):
        """
        Conv2d weight gradient (wgrad), checked against
        torch.nn.grad.conv2d_weight, including split-K variants.
        """
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
            problem_size = cutlass.shape.Conv2DProblemSize(
                1, 4, 4, 16,
                8, 3, 3, 16,
                0, 0,
                3, 3,
                1, 1,
                cutlass.ConvMode.CrossCorrelation,
                1, 1
            )
            A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)
            alpha = 1.0
            beta = 0.5
            # wgrad requires the filter-gradient output shape explicitly
            weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
            D_ref = alpha * torch.nn.grad.conv2d_weight(
                B, weight_size, A,
                stride=stride, padding=padding
            ) + beta * C
            D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)
            assert torch.allclose(D, D_ref)
            # Test serial split-K
            D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
            assert torch.allclose(D, D_serial_split_k)
            # Test parallel split-K
            D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
            assert torch.allclose(D, D_parallel_split_k)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,100 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for compute node in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from cutlass import swizzle
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTComputeSM90(EVTTestCaseBase):
    """
    Tests for EVT compute nodes: arithmetic operators and named function
    calls inside an epilogue-visitor-tree definition.
    """

    def test_arith(self):
        """
        Test arithmetic ops (+, *, -, /) in an EVT epilogue
        """
        # NOTE(review): EVTTestBed appears to consume this function's body as
        # the epilogue definition, so it is kept as a plain expression.
        def evt_arith_compute(accum, C, alpha, beta, gamma):
            D = ((accum + C) * alpha - gamma) / beta
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, (l, m, n))
            }
            launcher = EVTTestBed(self.element, evt_arith_compute, example_inputs)
            input_keys = ["C", "alpha", "beta", "gamma"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_func_call(self):
        """
        Test function-call nodes (relu, multiply_add) in an EVT epilogue
        """
        def evt_func_call(accum, C, alpha, beta, gamma):
            D = multiply_add(relu(accum + alpha) + C, beta, gamma)
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, (l, m, n))
            }
            launcher = EVTTestBed(self.element, evt_func_call, example_inputs)
            input_keys = ["C", "alpha", "beta", "gamma"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,173 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for store nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLayoutSM90(EVTTestCaseBase):
    """
    Tests for EVT layout-manipulation nodes (permute, reshape) applied to
    epilogue intermediates and auxiliary inputs.
    """

    def test_permute_1(self):
        """
        Permute F and C to (l, n, m), add them, then permute the sum back so
        D is written with the original (l, m, n) shape.
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D_permute = F_permute + permute(C, indices=(0, 2, 1))
            D = permute(D_permute, indices=(0, 2, 1))
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_2(self):
        """
        Permute F to (l, n, m) and add C supplied in that shape; D is
        written with shape (l, n, m).
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D = F_permute + C
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, n, m)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, n, m)),
            }
            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_3(self):
        """
        Permute F to (m, l, n) — swapping the batch and row modes — and add
        C supplied in that shape; D is written with shape (m, l, n).
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(1, 0, 2))
            D = F_permute + C
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (m, l, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (m, l, n)),
            }
            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_reshape(self):
        """
        Reshape the (16, 32) auxiliary input TensorE to (512, 1) so it
        broadcasts column-wise against F.
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            E_reshape = reshape(TensorE, new_shape=(512, 1))
            D = F + E_reshape
            return D
        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (16, 32)),
            "D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
        }
        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
        input_keys = ["alpha", "TensorE"]
        result_keys = ["D"]
        launcher.verify(self.problem_size, input_keys, result_keys, self.l)

    def test_reshape2(self):
        """
        Reshape F from (l, m, n) to rank-4 (2, 3, 512, 256) and broadcast-add
        the auxiliary input TensorE of shape (2, 3, 1, n).
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
            D = F_reshape + TensorE
            return D
        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
            "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
        }
        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
        input_keys = ["alpha", "TensorE"]
        result_keys = ["D"]
        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,142 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for load nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLoadSM90(EVTTestCaseBase):
    """
    Tests for EVT auxiliary-load nodes: full tensors and row/column/scalar
    broadcasts, each paired with a batched fp32 counterpart.
    """

    def test_tensor_load(self):
        """
        Load extra tensors with shape [m, n] (unbatched) and [l, m, n]
        (batched, fp32)
        """
        def evt_tensor_load(accum, C, aux, aux_batch):
            D = accum + C + aux + aux_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "aux": self.fake_tensor(self.element, (m, n)),
                "aux_batch": self.fake_tensor(np.float32, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
            input_keys = ["C", "aux", "aux_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_row_broadcast(self):
        """
        Load extra tensors with shape [1, n] (row broadcast) and [l, 1, n]
        (batched, fp32)
        """
        def evt_row_broadcast(accum, C, bias, bias_batch):
            D = accum + C + bias + bias_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "bias": self.fake_tensor(self.element, (n,)),
                "bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
            input_keys = ["C", "bias", "bias_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_column_broadcast(self):
        """
        Load extra tensors with shape [m, 1] (column broadcast) and
        [l, m, 1] (batched, fp32)
        """
        def evt_column_broadcast(accum, C, bias, bias_batch):
            D = accum + C + bias + bias_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "bias": self.fake_tensor(self.element, (m, 1)),
                "bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
            input_keys = ["C", "bias", "bias_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_scalar_broadcast(self):
        """
        Load a host scalar and a per-batch [l, 1, 1] fp32 scalar tensor
        """
        def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
            D = accum + C + alpha + alpha_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
            input_keys = ["C", "alpha", "alpha_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,274 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unittest for mixed types of nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from cutlass.swizzle import ThreadblockSwizzleStreamK
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTMixedSM90(EVTTestCaseBase):
    """
    Tests for DAG-structured EVT epilogues that mix compute, auxiliary
    loads, broadcasts, and row/column max reductions, exercised under
    several scheduling modes (default, stage-2 epilogue, partition-K,
    stream-K, and unbatched).
    """

    def test_mixed_dag(self):
        """
        Mixed DAG epilogue at each supported alignment.
        """
        # `max(..., dim=...)` is the EVT reduction imported from
        # cutlass.epilogue, not Python's builtin max.
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        # NOTE(review): 'aligments' is a misspelling of 'alignments' (local only)
        if device_cc() == 80:
            aligments = [2, 4, 8]
        else:
            # Sm90 EVT currently only supports 128-bit alignment
            aligments = [8,]
        for align in aligments:
            for m, n, k, l in self.get_problem_sizes(align):
                example_inputs = {
                    "accum": self.fake_tensor(self.element, (l, m, n)),
                    "alpha": 1.0,
                    "C": self.fake_tensor(self.element, (l, m, n)),
                    "beta": 1.0,
                    "aux": self.fake_tensor(self.element, (l, m, n)),
                    "cbias": self.fake_tensor(self.element, (m, 1)),
                    "rbias": self.fake_tensor(self.element, (n,)),
                    "D": self.fake_tensor(self.element, (l, m, n)),
                    "F": self.fake_tensor(self.element, (l, m, n)),
                    "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                    "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
                }
                launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
                result_keys = ["D", "F", "F_row_max", "E_col_max"]
                launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_float(self):
        """
        Same mixed DAG with fp32 inputs/outputs, including odd alignment 3.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for align in [3, 2, 4]:
            for m, n, k, l in self.get_problem_sizes(align):
                example_inputs = {
                    "accum": self.fake_tensor(np.float32, (l, m, n)),
                    "alpha": 1.0,
                    "C": self.fake_tensor(np.float32, (l, m, n)),
                    "beta": 1.0,
                    "aux": self.fake_tensor(np.float32, (l, m, n)),
                    "cbias": self.fake_tensor(np.float32, (m, 1)),
                    "rbias": self.fake_tensor(np.float32, (n,)),
                    "D": self.fake_tensor(np.float32, (l, m, n)),
                    "F": self.fake_tensor(np.float32, (l, m, n)),
                    "F_row_max": self.fake_tensor(np.float32, (n,)),
                    "E_col_max": self.fake_tensor(np.float32, (m, 1))
                }
                launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
                result_keys = ["D", "F", "F_row_max", "E_col_max"]
                launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_stage2(self):
        """
        Same mixed DAG compiled with a 2-stage epilogue.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "beta": 1.0,
                "aux": self.fake_tensor(self.element, (l, m, n)),
                "cbias": self.fake_tensor(self.element, (m, 1)),
                "rbias": self.fake_tensor(self.element, (n,)),
                "D": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
            }
            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
            result_keys = ["D", "F", "F_row_max", "E_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_partition_k(self):
        """
        Same mixed DAG with a warp_count that partitions along K
        (warp_count[2] == 2) and a 2-stage epilogue.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "beta": 1.0,
                "aux": self.fake_tensor(self.element, (l, m, n)),
                "cbias": self.fake_tensor(self.element, (m, 1)),
                "rbias": self.fake_tensor(self.element, (n,)),
                "D": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
            }
            tile_description = {
                "threadblock_shape": [128, 128, 64],
                "warp_count": [2, 2, 2]
            }
            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
            result_keys = ["D", "F", "F_row_max", "E_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_stream_k(self):
        """
        Same mixed DAG under the stream-K threadblock swizzle, with and
        without an explicit tile description, batched and unbatched.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        # High per-sm occupancy tile_description
        tile_description = {
            "threadblock_shape": [128, 128, 32],
            "warp_count": [2, 2, 1],
            "stages": 3
        }
        tds = [None, tile_description]
        for td in tds:
            for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
                # Unbatched problems use rank-2 tensors; batched use rank-3
                if l == 1:
                    example_inputs = {
                        "accum": self.fake_tensor(self.element, (m, n)),
                        "alpha": 1.0,
                        "C": self.fake_tensor(self.element, (m, n)),
                        "beta": 1.0,
                        "aux": self.fake_tensor(self.element, (m, n)),
                        "cbias": self.fake_tensor(self.element, (m, 1)),
                        "rbias": self.fake_tensor(self.element, (n,)),
                        "D": self.fake_tensor(self.element, (m, n)),
                        "F": self.fake_tensor(self.element, (m, n)),
                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
                    }
                else:
                    example_inputs = {
                        "accum": self.fake_tensor(self.element, (l, m, n)),
                        "alpha": 1.0,
                        "C": self.fake_tensor(self.element, (l, m, n)),
                        "beta": 1.0,
                        "aux": self.fake_tensor(self.element, (l, m, n)),
                        "cbias": self.fake_tensor(self.element, (m, 1)),
                        "rbias": self.fake_tensor(self.element, (n,)),
                        "D": self.fake_tensor(self.element, (l, m, n)),
                        "F": self.fake_tensor(self.element, (l, m, n)),
                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
                    }
                if td is not None:
                    launcher = EVTTestBed(
                        self.element, evt_mixed_dag, example_inputs,
                        tile_description=td,
                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
                else:
                    launcher = EVTTestBed(
                        self.element, evt_mixed_dag, example_inputs,
                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
                result_keys = ["D", "F", "F_row_max", "E_col_max"]
                launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_mixed_dag_no_batch(self):
        """
        Same mixed DAG with rank-2 (unbatched) tensors throughout.
        """
        def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for m, n, k, _ in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (m, n)),
                "alpha": 1.0,
                "C": self.fake_tensor(self.element, (m, n)),
                "beta": 1.0,
                "aux": self.fake_tensor(self.element, (m, n)),
                "cbias": self.fake_tensor(self.element, (m, 1)),
                "rbias": self.fake_tensor(self.element, (n,)),
                "D": self.fake_tensor(self.element, (m, n)),
                "F": self.fake_tensor(self.element, (m, n)),
                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
            }
            launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
            result_keys = ["D", "F", "F_row_max", "E_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, 1)
# Allow this test module to be executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,155 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for store nodes in SM80 and SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTStoreSM90(EVTTestCaseBase):
    # NOTE(review): despite the SM90 suffix in the class name, the skip guard
    # above also admits SM80 devices.
    #
    # In every test below the variable names inside the traced epilogue
    # function are significant: they match the keys of example_inputs and are
    # the names used to bind runtime arguments and results.

    def test_aux_store(self):
        """
        Returning a tensor with shape [m, n]
        """
        # Stores the auxiliary intermediate F in addition to the output D.
        def evt_aux_store(accum, alpha, C):
            F = alpha * accum
            D = F + C
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_col_reduce(self):
        """
        Reduction [m, n] -> [m, 1]
        """
        # dim=[2,] reduces over the n axis: one value per (batch, row), i.e.
        # shapes (l, m, 1) and (m, 1) below. NOTE(review): the inner names say
        # "row" while the test name says "col" -- presumably "row max" means
        # "max of each row"; confirm against the project's naming convention.
        def evt_row_reduce(accum, alpha, C):
            acc_row_max = max(accum, dim=[2,])
            F = alpha * accum
            F_row_max = max(F, dim=[0, 2])
            D = F + C
            return D, F_row_max, acc_row_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 2.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F_row_max": self.fake_tensor(np.float32, (m, 1)),
                "acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F_row_max", "acc_row_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_row_reduce(self):
        """
        Reduction [m, n] -> [n]
        """
        # dim=[1,] reduces over the m axis: shapes (l, 1, n) and (n,) below.
        def evt_col_reduce(accum, alpha, C):
            acc_col_max = max(accum, dim=[1,])
            F = alpha * accum
            F_col_max = max(F, dim=[0, 1])
            D = F + C
            return D, F_col_max, acc_col_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 2.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F_col_max": self.fake_tensor(np.float32, (n,)),
                "acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F_col_max", "acc_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_scalar_reduce(self):
        """
        Reduction [m, n] -> [1,]
        """
        # acc_max keeps one value per batch (l, 1, 1); F_max additionally
        # reduces over the batch axis down to a single scalar (1,).
        def evt_scalar_reduce(accum, alpha, C):
            acc_max = max(accum, dim=[1, 2])
            F = alpha * accum
            F_max = max(F, dim=[0, 1, 2])
            D = F + C
            return D, F_max, acc_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 2.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
                "F_max": self.fake_tensor(np.float32, (1,)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F_max", "acc_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
# Allow this test module to be executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,44 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
    # Discover and run every EVT unit test (evt_*.py) located next to this
    # script, regardless of the current working directory.
    loader = unittest.TestLoader()
    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    tests = loader.discover(script_dir, 'evt_*.py')
    testRunner = unittest.runner.TextTestRunner()
    results = testRunner.run(tests)
    if not results.wasSuccessful():
        # SystemExit yields a clean nonzero exit status for CI instead of the
        # traceback a generic Exception would print.
        raise SystemExit('Test cases failed')

View File

@ -0,0 +1,230 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Testbed classes of EVT
"""
import torch
import unittest
import cutlass
from cutlass import Tensor
import cutlass.backend.evt
from cutlass.profiler import CUDAEventProfiler
from cutlass.shape import GemmCoord
from cutlass.utils.datatypes import torch_type
class EVTReferenceModule:
    """Host-side reference: a torch GEMM followed by the traced epilogue visitor."""

    def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
        """Record operand layouts and the visitor applied after the mainloop."""
        self.layout_A = layout_A
        self.layout_B = layout_B
        self.layout_C = layout_C
        self.epilogue_visitor = epilogue_visitor

    def _as_row_major(self, tensor, layout, rows, cols, batch):
        """View a flat tensor as (batch, rows, cols), transposing column-major data."""
        if layout == cutlass.LayoutType.RowMajor:
            return tensor.view((batch, rows, cols))
        # Column-major storage: view as transposed then permute back.
        return torch.permute(tensor.view((batch, cols, rows)), (0, 2, 1))

    def run(self, A, B, C, problem_size, alpha, beta, batch=1):
        """Compute alpha * A @ B + beta * C and return it flattened in layout_C order."""
        A_row = self._as_row_major(A, self.layout_A, problem_size.m, problem_size.k, batch)
        B_row = self._as_row_major(B, self.layout_B, problem_size.k, problem_size.n, batch)
        C_row = self._as_row_major(C, self.layout_C, problem_size.m, problem_size.n, batch)
        out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
        if self.layout_C == cutlass.LayoutType.ColumnMajor:
            out_row = torch.permute(out_row, (0, 2, 1))
        return torch.flatten(out_row)

    def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
        """Run the mainloop (alpha=1, beta=0), then the epilogue visitor.

        Always returns a tuple of reference results.
        """
        accum = self.run(
            A, B, C, problem_size, 1.0, 0.0, batch=batch
        ).reshape(batch, problem_size.m, problem_size.n)
        epilogue_args["accum"] = accum
        outputs = self.epilogue_visitor(**epilogue_args)
        return outputs if isinstance(outputs, tuple) else (outputs,)
class EVTTestBed:
    """
    Epilogue Visitor Testbed

    Compiles a GEMM whose epilogue is traced from a Python function and
    verifies device results against EVTReferenceModule running the same
    visitor on the host.
    """
    def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
        # element: operand data type; evt_fn: function traced into the visitor;
        # example_inputs: name -> Tensor/scalar examples used for tracing.
        # Optional kwargs: tile_description, swizzling_functor, epilogue_stages.
        self.element = element
        layout = cutlass.LayoutType.RowMajor
        self.example_inputs = example_inputs
        # Create the Gemm plan
        self.plan = cutlass.op.Gemm(element=element, layout=layout, element_accumulator=torch.float32)
        if "tile_description" in kwargs:
            self.plan.tile_description = kwargs["tile_description"]
        if "swizzling_functor" in kwargs:
            self.plan.swizzling_functor = kwargs["swizzling_functor"]
        # Compile the epilogue visitor
        epilogue_visitor = cutlass.epilogue.trace(evt_fn, example_inputs)
        if "epilogue_stages" in kwargs:
            epilogue_visitor.epilogue_stages = kwargs["epilogue_stages"]
        self.plan.epilogue_visitor = epilogue_visitor
        # Reference model (all operands row-major)
        self.reference_fn = EVTReferenceModule(layout, layout, layout, epilogue_visitor)
        self.profile = profile
    def get_torch_tensor(self, shape, dtype=None, fill=None):
        # Allocate a CUDA tensor. With fill=None, entries are
        # ceil(uniform(-4.5, 3.5)), i.e. small integer values so that the
        # torch.equal comparison in verify() can be exact.
        if dtype is None:
            dtype = self.element
        dtype = torch_type(dtype)
        if fill is None:
            return torch.ceil(
                torch.empty(size=shape, dtype=dtype, device="cuda").uniform_(-4.5, 3.5)
            )
        else:
            return torch.full(shape, fill, dtype=dtype, device="cuda")
    def verify(self, problem_size, input_keys, result_keys, batch_count=1):
        """
        Verify the results
        """
        problem_size = GemmCoord(*problem_size)
        # Initiate the GEMM arguments
        tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
        tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))
        # Initialize the epilogue args
        epilogue_args = {}
        for key in self.example_inputs.keys():
            if key in input_keys:
                # Inputs get random tensors; non-Tensor scalars pass through.
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element)
                else:
                    epilogue_args[key] = tensor
            elif key in result_keys:
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    # Pre-fill result tensors: max-reductions start far below
                    # any generated value so real results always overwrite.
                    if "max" in key:
                        fill = -1000
                    else:
                        fill = 0
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element, fill=fill)
                else:
                    epilogue_args[key] = tensor
        tensor_D = epilogue_args["D"]
        if "C" in epilogue_args:
            tensor_C = epilogue_args["C"]
        else:
            # No explicit C input: reuse D as the C operand. The host
            # reference multiplies C by beta=0 (see EVTReferenceModule.__call__).
            tensor_C = tensor_D
        # Run the device kernel
        self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)
        # Run the host reference
        evt_args_inputs = {}
        for key in input_keys:
            evt_args_inputs[key] = epilogue_args[key]
        reference_results = self.reference_fn(
            tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)
        # Compare the results. NOTE(review): assumes the reference returns its
        # results in the same order as result_keys.
        for result, ref in zip(result_keys, reference_results):
            assert torch.equal(epilogue_args[result].flatten(), ref.flatten())
        # Run profile
        if self.profile:
            profiler = CUDAEventProfiler(
                self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
                visitor_args = epilogue_args
            )
            print(f"Cutlass Python Duration: {profiler()}")
class EVTTestCaseBase(unittest.TestCase):
    """
    Base class for EVT Unittest

    Provides default problem extents, placeholder-tensor creation for EVT
    tracing, and problem-size enumeration for subclasses.
    """
    def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
        """
        :param methodName: standard unittest test-method name
        :param lmnk: default (batch, M, N, K) extents
        """
        super().__init__(methodName)
        self.element = cutlass.DataType.f16
        self.l, self.m, self.n, self.k = lmnk
        self.problem_size = (self.m, self.n, self.k)
        # Deterministic random inputs across runs
        torch.random.manual_seed(42)

    def fake_tensor(self, element, shape):
        """Create a placeholder row-major Tensor used only to trace the EVT."""
        return Tensor(element=element, shape=shape, layout_tag=cutlass.LayoutType.RowMajor)

    def get_problem_sizes(self, alignment, k=None, batch_count=(3,)):
        """
        Enumerate (m, n, k, batch) problem sizes exercising the given alignment.

        :param alignment: operand alignment; m/n extents are multiples of it
        :param k: optional K override; defaults to self.k
        :param batch_count: iterable of batch extents to sweep
        :return: list of (m, n, k, l) tuples

        The default for ``batch_count`` is a tuple to avoid the shared
        mutable-default-argument pitfall; ``k`` is compared against None
        explicitly rather than by truthiness.
        """
        if k is None:
            k = self.k
        problem_size_m = [alignment, 512 - 3 * alignment]
        problem_size_n = [alignment, 512 - alignment]
        if alignment % 8 == 0:
            # Larger extents only make sense for full 8-element alignment.
            problem_size_m.append(768)
            problem_size_n.append(768)
        return [
            (m, n, k, l)
            for m in problem_size_m
            for n in problem_size_n
            for l in batch_count
        ]

View File

@ -0,0 +1,134 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
High-level tests for running batched GEMMs
"""
from functools import partial
import logging
from math import prod
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
import torch
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
torch.manual_seed(2023)
def pytorch_reference(A, B, C, alpha, beta):
    """Compute alpha * (A @ B) + beta * C with broadcasting of unbatched operands.

    Any of A, B, C may carry leading batch dimensions; the first tensor found
    with more than two dimensions determines the batch shape (tensors with a
    batch dimension are assumed to have matching batch counts). The result has
    that batch shape followed by C's trailing (rows, cols).
    """
    # Batch shape from the first batched operand; default to a single batch.
    batch_dims = next((t.shape[:-2] for t in (A, B, C) if len(t.shape) > 2), (1,))
    flat_batch = prod(batch_dims)

    def as_flat_batched(tensor):
        # Broadcast unbatched operands; collapse multi-dim batches for bmm.
        if len(tensor.shape) == 2:
            return tensor.unsqueeze(0).repeat(flat_batch, 1, 1)
        return tensor.reshape(-1, tensor.size(-2), tensor.size(-1))

    A3 = as_flat_batched(A)
    B3 = as_flat_batched(B)
    C3 = as_flat_batched(C)
    out = (torch.bmm(A3, B3) * alpha) + (C3 * beta)
    return out.reshape(*(batch_dims + C3.shape[-2:]))
def initialize(rows, cols, batch):
    """Create a half-precision CUDA tensor of random integers in [-3, 3).

    Returns shape (*batch, rows, cols) when batch has more than one element
    in total, otherwise a plain (rows, cols) matrix.
    """
    total = rows * cols * prod(batch)
    flat = torch.randint(-3, 3, size=(total,), device='cuda').half()
    if len(batch) > 0 and prod(batch) > 1:
        return flat.reshape(*(batch + (rows, cols)))
    return flat.reshape(rows, cols)
class GemmF16Batched(unittest.TestCase):
    """Batched F16 GEMM tests sweeping which operands carry a batch dimension."""

    def run_batched(self, batch_count: tuple, batch_A: bool, batch_B: bool, batch_C: bool):
        """Run one batched GEMM and check it against the torch reference.

        :param batch_count: batch shape applied to the selected operands and D
        :param batch_A/batch_B/batch_C: whether each operand is batched
        """
        M, N, K = 512, 256, 128
        alpha, beta = 1., 2.
        unbatched = (1,)

        A = initialize(M, K, batch_count if batch_A else unbatched)
        B = initialize(K, N, batch_count if batch_B else unbatched)
        C = initialize(M, N, batch_count if batch_C else unbatched)
        D = initialize(M, N, batch_count)

        plan = cutlass.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=cutlass.DataType.f32)
        plan.run(A, B, C, D, alpha, beta)

        expected = pytorch_reference(A, B, C, alpha, beta)
        assert expected.equal(D)

    def test_batched_ABC(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, True, True)

    def test_batched_AB(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, True, False)

    def test_batched_AC(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, False, True)

    def test_batched_BC(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, False, True, True)

    def test_batched_A(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, False, False)

    def test_batched_B(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, False, True, False)

View File

@ -0,0 +1,125 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F16 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
# Target compute capability for this module's tests.
cc = 80

@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80(unittest.TestCase):
    """
    Empty wrapper class; GEMM test cases are attached to it dynamically in __main__.
    """
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class; stream-K GEMM test cases are attached to it dynamically in __main__.
    """
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f16, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

# Sweep every layout combination at the default 128x128x32 tile, 2x2x1 warps, 3 stages.
for _layouts in (LayoutCombination.NNN, LayoutCombination.NNT, LayoutCombination.NTN,
                 LayoutCombination.NTT, LayoutCombination.TNN, LayoutCombination.TNT,
                 LayoutCombination.TTN, LayoutCombination.TTT):
    add_test_tensorop(cls=GemmF16Sm80, layouts=_layouts, alignments=[8, 8, 8],
                      element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                      threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)

# TNT variations of tile shape, alignment, accumulator type, and stage count.
for _alignments, _accumulator, _tb_shape, _warps, _stages in (
        ([8, 8, 8], cutlass.DataType.f32, [ 64, 128, 32], [1, 2, 1], 3),
        ([8, 8, 8], cutlass.DataType.f32, [128,  64, 32], [2, 1, 1], 3),
        ([8, 8, 8], cutlass.DataType.f32, [ 64,  64, 64], [1, 1, 1], 3),
        ([4, 4, 8], cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3),
        ([4, 4, 8], cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3),
        ([8, 8, 8], cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3),
        ([8, 8, 8], cutlass.DataType.f32, [ 64,  64, 64], [1, 1, 1], 5),
        ([2, 2, 2], cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)):
    add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=_alignments,
                      element_output=cutlass.DataType.f16, element_accumulator=_accumulator,
                      threadblock_shape=_tb_shape, warp_count=_warps, stages=_stages)

# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
for _layouts, _accumulator, _tb_shape, _warps in (
        (LayoutCombination.NNN, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1]),
        (LayoutCombination.TNN, cutlass.DataType.f32, [ 64, 128, 8], [1, 2, 1]),
        (LayoutCombination.NTN, cutlass.DataType.f32, [128,  64, 8], [2, 1, 1]),
        (LayoutCombination.TTN, cutlass.DataType.f32, [ 64,  64, 8], [1, 1, 1]),
        (LayoutCombination.NNT, cutlass.DataType.f16, [128, 128, 8], [2, 2, 1])):
    add_test_simt(cls=GemmF16Sm80, layouts=_layouts, alignments=[1, 1, 1],
                  element_output=cutlass.DataType.f16, element_accumulator=_accumulator,
                  threadblock_shape=_tb_shape, warp_count=_warps, stages=2)

# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp,
                           swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8],
                 element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                 threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8],
                 element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                 threadblock_shape=[ 64,  64, 64], warp_count=[1, 1, 1], stages=5)
# Run the dynamically-registered tests when executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,140 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F16 operands on SM90
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
# Target compute capability for this module's tests.
cc = 90

@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF16Sm90(unittest.TestCase):
    """
    Empty wrapper class; GEMM test cases are attached to it dynamically in __main__.
    """
# Common settings for every test in this file: attach to GemmF16Sm90, use F16
# operands, leave warp count unset, and compile with nvcc.
add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
                               warp_count=None, compilation_modes=['nvcc'])
# Specialization for tests using Tensor Core MMA instructions
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
# Tests with 1x1x1 clusters
# NOTE(review): stages=None appears to let the generator choose the stage
# count automatically -- confirm against add_test_gemm in utils.
add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], stages=5)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
# Tests with different cluster shapes
add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                       element_accumulator=cutlass.DataType.f16, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[1, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 1, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 2, 1])
# Tests for different schedule modes
# Each (kernel_schedule, epilogue_schedule) pair is exercised with 1x1x1 and 2x1x1 clusters.
add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
                            element_output=cutlass.DataType.f32, element_accumulator=cutlass.DataType.f32,
                            opclass=cutlass.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
add_test_schedule(
    cluster_shape=[1, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
    cluster_shape=[1, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
add_test_schedule(
    cluster_shape=[2, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
    cluster_shape=[2, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8])
add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8])
add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8])
add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8])
add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8])
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,100 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F32 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM80)
cc = 80
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80(unittest.TestCase):
    """
    Empty wrapper class to which test methods for F32 GEMMs on SM80 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class to which stream-K F32 SM80 test methods are attached
    dynamically at module load time by the add_test_streamk calls below.
    """
    pass
# Common settings for every test in this file: F32 operands, SM80 target, 1x1x1 cluster
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f32, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 32], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,99 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F64 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM80)
cc = 80
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80(unittest.TestCase):
    """
    Empty wrapper class to which test methods for F64 GEMMs on SM80 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class to which stream-K F64 SM80 test methods are attached
    dynamically at module load time by the add_test_streamk calls below.
    """
    pass
# Common settings for every test in this file: F64 operands, SM80 target, 1x1x1 cluster
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f64, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 16], warp_count=[2, 2, 1], stages=4)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 32, 32, 16], warp_count=[2, 1, 1], stages=5)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                 element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,69 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F64 operands on SM90
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM90)
cc = 90
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF64Sm90(unittest.TestCase):
    """
    Empty wrapper class to which test methods for F64 GEMMs on SM90 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
# Every F64 SM90 test shares unit alignment, a 1x1x1 cluster, f64 operands,
# f64 output and accumulation, and nvcc compilation.
add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
                               element=cutlass.DataType.f64, element_output=cutlass.DataType.f64,
                               element_accumulator=cutlass.DataType.f64, compilation_modes=['nvcc'])
# (opclass, layouts, threadblock shape, stage count) for each registered test
_f64_sm90_cases = [
    (cutlass.OpcodeClass.TensorOp, LayoutCombination.NNT, [128, 128, 32], 3),
    (cutlass.OpcodeClass.TensorOp, LayoutCombination.TNN, [128, 128, 32], 3),
    (cutlass.OpcodeClass.Simt,     LayoutCombination.NNN, [128, 128, 8],  2),
    (cutlass.OpcodeClass.Simt,     LayoutCombination.TTT, [ 64, 128, 8],  2),
]
for _opclass, _layouts, _tb_shape, _stages in _f64_sm90_cases:
    add_test_specialized(opclass=_opclass, layouts=_layouts, threadblock_shape=_tb_shape, stages=_stages)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,99 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with S8 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM80)
cc = 80
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80(unittest.TestCase):
    """
    Empty wrapper class to which test methods for S8 GEMMs on SM80 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class to which stream-K S8 SM80 test methods are attached
    dynamically at module load time by the add_test_streamk calls below.
    """
    pass
# Common settings for every test in this file: S8 operands, SM80 target, 1x1x1 cluster
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.s8, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 4], element_output=cutlass.DataType.s32,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                 element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,95 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with S8 operands on SM90
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM90)
cc = 90
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmS8Sm90(unittest.TestCase):
    """
    Empty wrapper class to which test methods for S8 GEMMs on SM90 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
# Common settings for every test in this file: attach to GemmS8Sm90, S8 operands, nvcc compilation
add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=cutlass.DataType.s8, compilation_modes=['nvcc'])
# Specialization for tests using Tensor Core MMA instructions
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
# Tests with 1x1x1 clusters
# NOTE(review): stages=None appears to let the generator choose the stage
# count automatically -- confirm against add_test_gemm in utils.
add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 8], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 64, 32], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4, 4, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
# Tests with different cluster shapes
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)
# Tests with warp-specialized ping-pong schedule
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
                  kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
                  epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized)
# Tests for SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,387 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from math import prod
import os
import re
import subprocess
import torch
from cutlass import (
DataType,
DataTypeSize,
GemmUniversalMode,
LayoutType,
OpcodeClass,
ShortDataTypeNames,
SwizzlingFunctor
)
from cutlass.backend import compiler
from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
from cutlass.backend.memory_manager import get_allocated_size
from cutlass.backend.reduction_operation import ReductionArguments, ReductionOperation
from cutlass.shape import GemmCoord, MatrixCoord
from cutlass.utils.datatypes import torch_type
class GemmUniversalLauncher:
    """
    Compiles a GEMM operation (plus a split-K reduction kernel for pre-SM90
    architectures), launches it on device, and optionally verifies the result
    against a PyTorch reference GEMM.
    """

    def __init__(
        self,
        operation,
        seed=2080,
        verification=True,
        iterations=500,
        compiler_mode="nvcc",
        **kwargs,
    ) -> None:
        """
        :param operation: GEMM operation to compile and launch
        :param seed: seed for torch's random number generator
        :param verification: whether to verify results against a reference
        :param iterations: unused here; accepted for interface compatibility
        :param compiler_mode: compiler backend to use ("nvcc" or "nvrtc")
        """
        # Create the reduction kernel, if needed
        self.reduction_operation: ReductionOperation = ReductionOperation(
            shape=MatrixCoord(4, 32 * operation.C.alignment),
            C=operation.C,
            element_accumulator=operation.tile_description.math_instruction.element_accumulator,
            element_compute=operation.epilogue_functor.element_epilogue,
            epilogue_functor=operation.epilogue_functor,
            count=operation.C.alignment,
        )
        self.math_operation = operation.tile_description.math_instruction.math_operation
        self.verification = verification
        # Select the compiler backend used to JIT the kernels.
        if compiler_mode == "nvcc":
            compiler.nvcc()
        elif compiler_mode == "nvrtc":
            compiler.nvrtc()
        else:
            raise Exception(f"Unexpected compiler string {compiler_mode}")
        op_list = [operation]
        if operation.arch < 90:
            # Split K via Python is currently only supported for pre-SM90 kernels
            op_list.append(self.reduction_operation)
        compiler.add_module(op_list, bypass_cache=False)
        self.operation = operation
        # Torch dtypes corresponding to each operand's CUTLASS data type.
        # D uses C's element type.
        self.dtype_A = torch_type(operation.A.element)
        self.dtype_B = torch_type(operation.B.element)
        self.dtype_C = torch_type(operation.C.element)
        self.dtype_D = torch_type(operation.C.element)
        accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
        element_size = DataTypeSize[operation.A.element]
        # Narrow the random-initialization range for small element types,
        # presumably so accumulation stays exact for the comparison against
        # the reference — TODO confirm the range rationale.
        if element_size == 1:
            self.rand_max = 1
            self.rand_min = 0
        elif element_size <= 8:
            self.rand_max = 1
            self.rand_min = -1
        elif element_size == 16:
            self.rand_max = 4
            self.rand_min = -4
        else:
            self.rand_max = 8
            self.rand_min = -8
        self.seed = seed
        self.compute_type = operation.epilogue_functor.element_epilogue
        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator

    def print_problem_size(self, p, mode, batch_count):
        """
        Prints the problem size, batch count, and mode (used on verification failure).

        :param p: problem size with .m/.n/.k members (e.g. GemmCoord)
        :param mode: GemmUniversalMode of the failing run
        :param batch_count: batch count of the failing run
        """
        if mode == GemmUniversalMode.Gemm:
            mode = "Gemm"
        elif mode == GemmUniversalMode.Batched:
            mode = "GemmBatched"
        elif mode == GemmUniversalMode.GemmSplitKParallel:
            mode = "GemmSplitKParallel"
        print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")

    def uniform_init(self, shape, dtype, layout):
        """
        Initializes a tensor of ``shape`` with integer-valued random data in
        [self.rand_min, self.rand_max].

        :param shape: shape of the tensor to create
        :param dtype: torch dtype of the tensor
        :param layout: CUTLASS layout (row- or column-major) of the operand

        :return: tuple of (tensor laid out for the CUTLASS kernel — transposed
            for column-major operands — and the row-major reference tensor)
        """
        size = prod(shape)
        if dtype.is_floating_point:
            # Ceil of a uniform draw in [rand_min - 0.5, rand_max - 0.5) yields
            # integer-valued floats in [rand_min, rand_max].
            data = torch.ceil(torch.empty(size=(size,), dtype=dtype, device="cuda").uniform_(self.rand_min - 0.5, self.rand_max - 0.5))
        else:
            # PyTorch does not currently support integer-typed matrix multiplications on GPU.
            # Fall back to CPU for integer type references.
            data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)
        if dtype == torch.float64 or dtype == torch.float32:
            # NOTE(review): f32/f64 data is moved to CPU, presumably so the
            # reference computation for these types runs on CPU — confirm.
            data = data.to("cpu")
        data_ref = data.reshape(shape)
        if layout == LayoutType.RowMajor:
            data_cutlass = data_ref
        else:
            # Column-major operands are stored as the transpose of the
            # row-major reference data.
            data_cutlass = data_ref.transpose(-1, -2).contiguous()
        data_cutlass = data_cutlass.to("cuda")
        return data_cutlass, data_ref

    def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
        """
        Computes the reference result D = alpha * (A @ B) + beta * C via torch.

        :return: reference tensor D, cast to the operation's D dtype
        """
        # If any tensor is on CPU, place all tensors on CPU unless only
        # tensor C is on CPU
        devices = [x.device.type for x in [tensor_A, tensor_B, tensor_C]]
        if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
            device = torch.device("cpu")
        else:
            device = tensor_A.device
        tensor_A = tensor_A.to(device)
        tensor_B = tensor_B.to(device)
        tensor_C = tensor_C.to(device)
        # Perform the alpha/beta scaling in the epilogue's compute type.
        dtype = torch_type(self.compute_type)
        alpha_torch = torch.tensor([alpha], device=device).to(dtype)
        beta_torch = torch.tensor([beta], device=device).to(dtype)
        tmp = tensor_A @ tensor_B
        tensor_D_ref = (alpha_torch * tmp) + (tensor_C * beta_torch)
        return tensor_D_ref.to(self.dtype_D)

    def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        """
        Runs the operation once for the given configuration and (optionally)
        verifies the result against the torch reference.

        :param mode: GemmUniversalMode to run in
        :param problem_size: GemmCoord problem size
        :param batch_count: number of batches (or split-K slices; see below)
        :param split_k_slices: number of split-K slices for split-K modes
        :param alpha: epilogue scalar alpha
        :param beta: epilogue scalar beta

        :return: whether the computed result matched the reference
        :rtype: bool
        """
        torch.random.manual_seed(self.seed)
        # Assign an actual batch count in cases where we are not running in batched mode.
        # This is to differentiate between the number of split K slices and the batch count,
        # which are overloaded within the single `batch_count` variable.
        if mode == GemmUniversalMode.Batched:
            true_batch_count = batch_count
        else:
            true_batch_count = 1

        def transpose(layout):
            # Flips between row- and column-major layouts.
            if layout == LayoutType.RowMajor:
                return LayoutType.ColumnMajor
            else:
                return LayoutType.RowMajor

        # If the operation's A/B operands were swapped during construction
        # (`switched`), initialize each operand with the other's transposed layout.
        tensor_A, tensor_A_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.k),
            self.dtype_A,
            self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
        )
        tensor_B, tensor_B_ref = self.uniform_init(
            (true_batch_count, problem_size.k, problem_size.n),
            self.dtype_B,
            self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
        )
        tensor_C, tensor_C_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.n),
            self.dtype_C,
            self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
        )
        tensor_D = torch.zeros_like(tensor_C)
        # Integer epilogues require integral alpha/beta.
        if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
            alpha = int(alpha)
            beta = int(beta)
        #
        # Launch kernel
        #
        arguments = GemmArguments(
            operation=self.operation,
            problem_size=problem_size,
            A=tensor_A,
            B=tensor_B,
            C=tensor_C,
            D=tensor_D,
            output_op=self.operation.epilogue_type(alpha, beta),
            gemm_mode=mode,
            split_k_slices=split_k_slices,
            batch=batch_count,
        )
        if mode == GemmUniversalMode.GemmSplitKParallel:
            # Parallel split-K writes partial accumulators to `arguments.ptr_D`;
            # a separate reduction kernel combines them into `tensor_D`.
            reduction_arguments = ReductionArguments(
                self.reduction_operation,
                problem_size=[problem_size.m, problem_size.n],
                partitions=split_k_slices,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=self.reduction_operation.epilogue_type(alpha, beta),
            )
        self.operation.run(arguments)
        if mode == GemmUniversalMode.GemmSplitKParallel:
            self.reduction_operation.run(reduction_arguments)
        passed = True
        if self.verification:
            # Synchronize on whichever kernel produced the final result.
            if mode == GemmUniversalMode.GemmSplitKParallel:
                reduction_arguments.sync()
            else:
                arguments.sync()
            tensor_D_ref = self.reference(
                problem_size,
                tensor_A_ref,
                tensor_B_ref,
                tensor_C_ref,
                alpha,
                beta,
            )
            tensor_D_ref = tensor_D_ref.to('cuda')
            # The reference is row-major; transpose the device result back
            # when the operation produced a column-major (or switched) D.
            if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
                tensor_D = tensor_D.transpose(-1, -2).contiguous()
            passed = tensor_D.equal(tensor_D_ref)
            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size, mode, batch_count)
        del arguments
        if mode == GemmUniversalMode.GemmSplitKParallel:
            del reduction_arguments
        # Ensure all device memory allocated for this run was released.
        cur_size = get_allocated_size()
        assert cur_size == 0, f"{cur_size} B of memory were not released after this run"
        return passed
def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
    """
    Sweeps GEMM modes, problem sizes, batch counts, and epilogue scalars
    appropriate for ``testcase`` and runs ``operation`` on each combination,
    verifying results against a reference.

    :param operation: GEMM operation to test
    :param testcase: test flavor; "multistage" uses a small fixed sweep,
        anything else (typically "universal") uses the full sweep.
        "interleaved" is unsupported.
    :param compilation_mode: compiler backend to use ("nvcc" or "nvrtc")

    :return: whether all runs passed
    :rtype: bool
    """
    passed = True
    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
    )
    opcode_class = operation.tile_description.math_instruction.opcode_class
    # SIMT kernels have no vectorized-access requirement; other opcode classes
    # require 128-bit-aligned accesses on the smallest operand type.
    if opcode_class == OpcodeClass.Simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size
    alignment_m = alignment
    alignment_n = alignment
    alignment_k = alignment
    # INT8 alignment constraints
    if opcode_class == OpcodeClass.Simt:
        A_is_s8 = operation.A.element == DataType.s8
        B_is_s8 = operation.B.element == DataType.s8
        if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
            alignment_m = 4
        # Bug fix: this previously read `B_is_s8 == DataType.s8`, comparing a
        # bool against an enum (always False, so alignment_n was never
        # constrained), and tested operand A's layout instead of B's.
        if B_is_s8 and operation.B.layout == LayoutType.RowMajor:
            alignment_n = 4
        if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
            alignment_k = 4
    threadblock_k = operation.tile_description.threadblock_shape[2]
    assert testcase != "interleaved"
    # Split K via Python is only supported pre-SM90 and not with stream-K.
    supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK
    if testcase == "multistage":
        modes = [GemmUniversalMode.Gemm]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        problem_size_k = [
            threadblock_k,
            threadblock_k * operation.tile_description.stages
            + operation.tile_description.math_instruction.instruction_shape[2],
        ]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1]
    else:
        modes = [GemmUniversalMode.Gemm]
        batch_counts = [1, 2, 3, 5, 7]
        if supports_split_k:
            modes.append(GemmUniversalMode.GemmSplitKParallel)
        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        # When stage count is resolved automatically (None), assume a nominal
        # stage count for sizing the K extents of the sweep.
        if operation.tile_description.stages is None:
            stages_for_k_calc = 7
        else:
            stages_for_k_calc = operation.tile_description.stages
        problem_size_k = [
            alignment_k,
            threadblock_k * stages_for_k_calc - alignment_k,
            threadblock_k * stages_for_k_calc * 3 - alignment_k,
        ]
        problem_alpha = [1.0]
        problem_beta = [2.0]
    testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)
    for mode in modes:
        for m in problem_size_m:
            for n in problem_size_n:
                for k in problem_size_k:
                    for batch_count in batch_counts:
                        for alpha in problem_alpha:
                            for beta in problem_beta:
                                # skip very small K problems
                                if testcase == "universal":
                                    if k // batch_count < 2 * threadblock_k:
                                        continue
                                problem_size = GemmCoord(m, n, k)
                                if supports_split_k:
                                    split_k_slices = batch_count
                                else:
                                    split_k_slices = 1
                                # `batch_count` doubles as the split-K slice
                                # count; plain GEMM with batch_count > 1 runs
                                # in batched mode instead.
                                overridden_mode = mode
                                if mode == GemmUniversalMode.Gemm and batch_count > 1:
                                    overridden_mode = GemmUniversalMode.Batched
                                passed = testbed.run(
                                    overridden_mode,
                                    problem_size,
                                    batch_count,
                                    split_k_slices,
                                    alpha,
                                    beta,
                                )
                                if not passed:
                                    return False
    return passed

View File

@ -0,0 +1,44 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
    # Discover every gemm_*.py test module next to this script and run it,
    # failing loudly if any test case does not pass.
    search_root = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(search_root, 'gemm_*.py')
    outcome = unittest.runner.TextTestRunner().run(suite)
    if not outcome.wasSuccessful():
        raise Exception('Test cases failed')

View File

@ -0,0 +1,239 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass
from cutlass import (
DataTypeNames,
EpilogueScheduleSuffixes,
KernelScheduleSuffixes,
LayoutType,
OpcodeClassNames,
ShortDataTypeNames,
ShortLayoutTypeNames
)
from cutlass.backend import library
from cutlass.backend.utils.software import SubstituteTemplate
from gemm_testbed import test_all_gemm
class Layout:
    """
    Utility class to map transpose and non-transpose terminology to row- and column-major terminology
    """
    # 'T' (transposed) corresponds to row-major storage.
    T = LayoutType.RowMajor
    # 'N' (non-transposed) corresponds to column-major storage.
    N = LayoutType.ColumnMajor
class LayoutCombination:
    """
    Utility class defining all combinations of row- and column-major layouts for operands to a GEMMs

    Each attribute is a (layout_A, layout_B, layout_C) tuple named by the
    BLAS-style transpose letters of the three operands.
    """
    NNN = (Layout.N, Layout.N, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NTT = (Layout.N, Layout.T, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TTT = (Layout.T, Layout.T, Layout.T)
def get_name(
    layouts,
    alignments,
    element_output,
    element_accumulator,
    element_epilogue,
    cluster_shape,
    threadblock_shape,
    stages,
    element_a,
    element_b,
    arch,
    opclass,
    kernel_schedule=None,
    epilogue_schedule=None,
    suffix="",
):
    """
    Builds the procedural test-case name for a GEMM configuration.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue (unused in the name)
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param kernel_schedule: kernel_schedule type
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue_schedule type
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: procedural test name
    :rtype: str
    """
    template = "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"
    # Architecture, operand types/layouts, and operation class.
    fields = {
        "arch": str(arch),
        "eA": DataTypeNames[element_a],
        "eB": DataTypeNames[element_b],
        "eC": DataTypeNames[element_output],
        "opclass": OpcodeClassNames[opclass],
        "acc": DataTypeNames[element_accumulator],
    }
    fields["lA"], fields["lB"], fields["lC"] = (ShortLayoutTypeNames[lay] for lay in layouts)
    # Threadblock tile, cluster shape, and per-operand alignments.
    for key, dim in zip(("tbM", "tbN", "tbK"), threadblock_shape):
        fields[key] = str(dim)
    for key, dim in zip(("cM", "cN", "cK"), cluster_shape):
        fields[key] = str(dim)
    for key, align in zip(("aA", "aB", "aC"), alignments):
        fields[key] = str(align)
    # Stage count is "auto" when it will be resolved automatically.
    fields["stages"] = str(stages) if stages is not None else "auto"
    # Optional schedule suffixes and caller-provided suffix (empty when unset).
    fields["k"] = KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else ""
    fields["e"] = EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else ""
    fields["suffix"] = suffix if suffix is not None else ""
    return SubstituteTemplate(template, fields)
def add_test_gemm(
    cls=None,
    cc=None,
    element=None,
    layouts=None,
    alignments=None,
    element_output=None,
    element_accumulator=None,
    cluster_shape=None,
    threadblock_shape=None,
    warp_count=None,
    stages=None,
    opclass=None,
    swizzle=None,
    kernel_schedule=None,
    epilogue_schedule=None,
    compilation_modes=('nvcc', 'nvrtc')):  # tuple: avoid a mutable default argument
    """
    Create test-running functions with the given specification and set it as a method of ``cls``.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param cc: compute capability to compile for
    :type cc: int
    :param element: data type of A and B operands
    :type element: cutlass.DataType.f16
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param cluster_shape: dimensions of clusters
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param swizzle: threadblock swizzling functor
    :param kernel_schedule: kernel schedule to use
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue schedule to use
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param compilation_modes: list of compilers to used in testing the kernel (options: 'nvrtc', 'nvcc')
    :type compilation_modes: list
    """
    for compilation_mode in compilation_modes:
        # Bug fix: bind ``compilation_mode`` as a default argument. ``run`` is
        # defined inside the loop, and a plain closure late-binds the loop
        # variable — previously every generated test (despite its name) ran
        # with the final mode in ``compilation_modes``.
        def run(self, compilation_mode=compilation_mode):
            """
            Dynamically-generated function that constructs a GEMM operation and verifies it against
            multiple test cases.
            """
            element_A = element
            element_B = element
            layout_A, layout_B, layout_C = layouts
            alignment_A, alignment_B, alignment_C = alignments
            plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                                   element_C=element_output, element_D=element_output,
                                   layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                                   element_accumulator=element_accumulator,
                                   kernel_cc=cc)
            plan.opclass = opclass
            if swizzle is not None:
                plan.swizzling_functor = swizzle
            # Start from the first available tile description and override the
            # fields specified by the test.
            td = plan.tile_descriptions()[0]
            if warp_count is not None:
                td.warp_count = warp_count
            td.threadblock_shape = threadblock_shape
            td.stages = stages
            td.cluster_shape = cluster_shape
            op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
            self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))

        element_epilogue = element_accumulator
        # Attach the generated test under a procedural name that encodes the
        # full configuration (including the compilation mode suffix).
        name = get_name(
            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
            stages=stages, element_a=element, element_b=element, arch=cc, opclass=opclass,
            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')
        setattr(cls, name, run)

View File

@ -0,0 +1,284 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests the high-level Conv2d interface
"""
from math import ceil
import unittest
import cutlass
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
import os
class Conv2dEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Conv2d interface

    A reference plan/operation is built in ``__init__`` from fully-specified
    parameters; each test method builds alternative plans (partially-specified
    parameters, numpy/torch tensor frontends) and asserts they emit the same kernel.
    """

    def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
                 alignment_A, alignment_B, alignment_C):
        """
        :param conv_kind: convolution kind ("fprop", "wgrad", or "dgrad")
        :param element_A: data type of operand A
        :param element_B: data type of operand B
        :param element_C: data type of operand C
        :param element_D: data type of operand D
        :param element_accumulator: data type used in accumulation
        :param alignment_A: alignment of operand A
        :param alignment_B: alignment of operand B
        :param alignment_C: alignment of operand C
        """
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C
        self.conv_kind = conv_kind
        # Reference plan/operation against which all other constructions are compared.
        self.plan = cutlass.op.Conv2d(
            kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
            element_D=element_D, element_accumulator=element_accumulator)
        self.op = self.plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal

        :param other_plan: plan to compare against the default Conv2d
        :type other_plan: cutlass.op.Conv2d

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)
        # Plans are considered equal if they emit identical kernel source.
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
        and layouts for constructing the Conv2d interface
        """
        # NOTE(review): this guard mirrors numpy_test's, but no numpy is used
        # below — confirm whether it is intentional.
        if not datatypes.numpy_available:
            return
        # Test when specifying all parameters
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator)
        assert self._plans_equal(plan_other)
        # Test when specifying all parameters but A
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)
        # Test when specifying all parameters but A and B as tensors using generic element and output
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)
        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Conv2d(
                kind=self.conv_kind,
                element_C=self.element_C,
                element_D=self.element_D,
                element=self.element_A)
            assert self._plans_equal(plan_other)
        # Test with only the generic types. Only run if the types of A, B, C, and D are the same
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
                and self.element_A == self.element_accumulator):
            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
        """
        if not datatypes.numpy_available:
            return
        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)
        # Tensor contents are irrelevant here — only dtypes are inspected.
        size = (2, 2)
        A = np.zeros(size, dtype=type_A)
        B = np.zeros(size, dtype=type_B)
        C = np.zeros(size, dtype=type_C)
        D = np.zeros(size, dtype=type_D)
        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def torch_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
        """
        if not datatypes.torch_available:
            return
        import torch
        type_A = datatypes.torch_type(self.element_A)
        type_B = datatypes.torch_type(self.element_B)
        type_C = datatypes.torch_type(self.element_C)
        type_D = datatypes.torch_type(self.element_D)
        type_accum = datatypes.torch_type(self.element_accumulator)
        # Tensor contents are irrelevant here — only dtypes are inspected.
        size = (2, 2)
        A = torch.empty(size, dtype=type_A)
        B = torch.empty(size, dtype=type_B)
        C = torch.empty(size, dtype=type_C)
        D = torch.empty(size, dtype=type_D)
        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
        """
        Shared body for the numpy/torch frontend tests: constructs plans from
        the given tensors/types and asserts equivalence with the reference plan.
        """
        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)
        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
        assert self._plans_equal(plan_np)
        # Test when specifying all parameters but A and B as tensors and using generic element and output
        if type_A == type_B:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
            assert self._plans_equal(plan_np)
        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal.
        if type_C == type_accum:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)
        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all tests on the Conv2d interface
        """
        self.generic_test()
        self.numpy_test()
        self.torch_test()
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class ConvEquivalenceTest(unittest.TestCase):
    """
    Tests the equivalence of different constructions of the Conv2d interface

    Test methods are generated dynamically and attached via ``add_test``.
    """
    # NOTE(review): the skip condition uses `<= 80`, which also skips SM80
    # devices even though the message refers to SM80 tests — confirm intent.
    pass
# Memory alignment (in elements) used for each operand data type.
# Presumably the maximal 16-byte vectorized access: 8 x f16 = 4 x f32 = 16B
# — TODO confirm against the kernels' alignment requirements.
type2alignment = {
    cutlass.DataType.f16: 8,
    cutlass.DataType.f32: 4
}
def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
    """
    Generates a Conv2d-equivalence test for the given configuration and
    attaches it to ``ConvEquivalenceTest`` under a procedural name.
    """
    test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"

    def run(self):
        # Build the equivalence checker and exercise every construction path.
        checker = Conv2dEquivalence(
            conv_kind=conv_kind,
            element_A=element_A, element_B=element_B,
            element_C=element_C, element_D=element_D,
            element_accumulator=element_accumulator,
            alignment_A=type2alignment[element_A],
            alignment_B=type2alignment[element_B],
            alignment_C=type2alignment[element_C],
        )
        checker.test_all()

    setattr(ConvEquivalenceTest, test_name, run)
# Register one equivalence test per conv kind crossed with each supported
# (A, B, C, D, accumulator) data-type combination.
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    for types in [
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f16],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32],
        [cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32]
    ]:
        add_test(conv_kind, *types)
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class Conv2dErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Conv2d interface
    """
    # NOTE(review): the skip condition uses `<= 80`, which also skips SM80
    # devices even though the message refers to SM80 tests — confirm intent.

    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
        # Alignment 3 is expected to be rejected for F16 operands.
        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
        td = plan.tile_descriptions()[0]
        # Deliberately malformed threadblock shape; compilation should fail.
        td.threadblock_shape = [17, 32, 5]
        plan.tile_description = td
        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
            plan.compile()
        # Clean up the error message
        os.remove("./cutlass_python_compilation_device_error.txt")
# Run all unit tests defined in this module when invoked as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,245 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Test the EVT interface
"""
import numpy as np
import unittest
import cutlass
from cutlass import LayoutType, Tensor
from cutlass.backend.utils.device import device_cc
from cutlass.epilogue import reshape, permute
from utils import ExpectException
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class EVTErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the EVT interface

    Each test traces an intentionally malformed epilogue-visitor-tree (EVT)
    function through ``cutlass.epilogue.trace`` and asserts — via
    ``ExpectException`` with ``verify_msg=True`` — that tracing fails with
    the exact expected error message.
    """

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
    def test_root_not_d(self):
        """
        Test when "D" does not exist in Sm90 EVT
        """
        # The epilogue returns a tensor named 'F' instead of the required
        # root output 'D'.
        def evt_root_not_d(accum, alpha):
            F = accum * alpha
            return F
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }
        # Only Sm90 enforces the 'D' root requirement, hence the cc guard on
        # whether an exception is expected.
        with ExpectException(device_cc() == 90,
            "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
            "but the variable 'D' is not found in the return values.", True):
            cutlass.epilogue.trace(evt_root_not_d, example_tensors)

    def test_no_accum(self):
        """
        Test when "accum" is not in input arguments
        """
        # Every EVT must take the accumulator ('accum') as an input; this
        # epilogue deliberately omits it.
        def evt_no_accum(alpha, C):
            D = alpha * C
            return D
        example_tensors = {
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
            cutlass.epilogue.trace(evt_no_accum, example_tensors)

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
    def test_too_much_shared_memory(self):
        """
        Test when the epilogue consumes too much shared memory
        """
        # Chain five auxiliary inputs and five auxiliary outputs so that no
        # tile description can fit the epilogue's buffers in shared memory.
        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
            D1 = accum + C1
            D2 = D1 + C2
            D3 = D2 + C3
            D4 = D3 + C4
            D = D4 + C5
            return D, D1, D2, D3, D4
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C1": self.fake_tensor(np.float16, (6, 512, 512)),
            "C2": self.fake_tensor(np.float16, (6, 512, 512)),
            "C3": self.fake_tensor(np.float16, (6, 512, 512)),
            "C4": self.fake_tensor(np.float16, (6, 512, 512)),
            "C5": self.fake_tensor(np.float16, (6, 512, 512)),
            "D1": self.fake_tensor(np.float16, (6, 512, 512)),
            "D2": self.fake_tensor(np.float16, (6, 512, 512)),
            "D3": self.fake_tensor(np.float16, (6, 512, 512)),
            "D4": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        # Tracing itself succeeds; the failure occurs when the visitor is
        # attached to a plan and no valid tile description can be found.
        epilogue_visitor = cutlass.epilogue.trace(evt_too_much_shared_memory, example_tensors)
        plan = cutlass.op.Gemm(
            element=np.float16, layout=cutlass.LayoutType.RowMajor,
            element_accumulator=np.float32
        )
        with ExpectException(True,
            "RuntimeError: The epilogue consumes too much shared memory. "
            "No valid tile description is found in the generator.", True):
            plan.epilogue_visitor = epilogue_visitor

    def test_not_ssa(self):
        """
        Test when the epilogue is not in SSA
        """
        # Case 1: 'F' is assigned twice, violating single static assignment.
        def evt_redefine(accum, C, alpha):
            F = accum + C
            F = F * alpha
            D = F
            return D, F
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
            cutlass.epilogue.trace(evt_redefine, example_tensors)

        # Case 2: 'C' is deliberately used without being defined or passed
        # as an argument (this is the condition under test, not a bug here).
        def evt_undefine(accum, alpha):
            F = accum + C
            D = F * alpha
            return D, F
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
            cutlass.epilogue.trace(evt_undefine, example_tensors)

    def test_missing_example_tensor(self):
        """
        Test when the example tensor of an input/output variable is not provided
        """
        def evt_missing_example_tensor(accum, C):
            D = accum + C
            return D
        # Missing example for output 'D'.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }
        with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)
        # Missing example for input 'C'.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
        }
        with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)

    def test_return_expression(self):
        """
        Test when the return value is an expression
        """
        # Return values must be named variables, not expressions.
        def evt_return_expr(accum, C):
            return accum + C
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }
        with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
            cutlass.epilogue.trace(evt_return_expr, example_tensors)

    def test_incompatible_shape(self):
        """
        Test when the shape of example tensors are incompatible
        """
        def evt_incompatible_shape(accum, C):
            D = accum + C
            return D
        # accum has 256 rows while C has 512 — not broadcastable.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 256, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True,
            "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):
            cutlass.epilogue.trace(evt_incompatible_shape, example_tensors)

    def test_no_matching_impl(self):
        # The permute+reshape on 'bias' yields a layout with strides that no
        # EVT load node implementation supports.
        def evt_no_matching_impl(accum, bias):
            D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
            return D
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 256)),
            "bias": self.fake_tensor(np.float16, (16, 32)),
            "D": self.fake_tensor(np.float16, (6, 512, 256))
        }
        with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
            cutlass.epilogue.trace(evt_no_matching_impl, example_tensors)

    #
    # Helper functions
    #

    def fake_tensor(self, element, shape):
        # Builds a placeholder row-major Tensor used only for dtype/shape
        # inference during tracing; it carries no real device data.
        return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)
# Allow running this test module directly (e.g., `python <file>`).
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,351 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests the high-level GEMM interface
"""
from math import ceil
import unittest
import cutlass
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
class GemmEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Gemm interface

    The fully-specified plan built in ``__init__`` is the reference; each
    ``*_test`` method builds the same plan through a different combination of
    explicit/generic/tensor-inferred parameters and asserts the emitted C++
    is identical.
    """
    def __init__(self, element_A, element_B, element_C, element_D, element_accumulator,
                 layout_A, layout_B, layout_C, alignment_A, alignment_B, alignment_C):
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.layout_A = layout_A
        self.layout_B = layout_B
        self.layout_C = layout_C
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C
        # Reference plan/operation against which all variants are compared
        self.plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B, element_C=element_C,
                                    element_D=element_D, element_accumulator=element_accumulator,
                                    layout_A=layout_A, layout_B=layout_B, layout_C=layout_C)
        self.op = self.plan.construct(alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal

        :param other_plan: plan to compare against the default GEMM
        :type other_plan: cutlass.op.Gemm

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(alignment_A=self.alignment_A, alignment_B=self.alignment_B, alignment_C=self.alignment_C)

        # Compare whether the operations are equal by comparing the C++ code that would be emitted for them
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Gemm interface when using CUTLASS data types
        and layouts for constructing the Gemm interface
        """
        # NOTE(review): this guard mirrors numpy_test, but generic_test does
        # not appear to use numpy itself — possibly over-conservative; confirm.
        if not datatypes.numpy_available:
            return

        # Test when specifying all parameters
        plan_other = cutlass.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
                                     element_D=self.element_D, element_accumulator=self.element_accumulator,
                                     layout_A=self.layout_A, layout_B=self.layout_B, layout_C=self.layout_C)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A
        plan_other = cutlass.op.Gemm(element_B=self.element_B, element_C=self.element_C,
                                     element_D=self.element_D, element_accumulator=self.element_accumulator,
                                     layout_B=self.layout_B, layout_C=self.layout_C,
                                     element=self.element_A, layout=self.layout_A)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        # Only run this test if the layouts and types for A and B are equal.
        if self.element_A == self.element_B and self.layout_A == self.layout_B:
            plan_other = cutlass.op.Gemm(element_C=self.element_C, element_D=self.element_D, element_accumulator=self.element_accumulator,
                                         layout_C=self.layout_C, element=self.element_A, layout=self.layout_A)
            assert self._plans_equal(plan_other)

        # Test without explicit accumulator. Only run if the types of C and the accumulator match.
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
                                         element_D=self.element_D, layout_A=self.layout_A, layout_B=self.layout_B,
                                         layout_C=self.layout_C)
            assert self._plans_equal(plan_other)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
            and self.element_A == self.element_accumulator and
            self.layout_A == self.layout_B and self.layout_A == self.layout_C):
            plan_other = cutlass.op.Gemm(element=self.element_A, layout=self.layout_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Gemm interface when using numpy as a frontend
        """
        # Skip entirely if numpy is not installed in this environment
        if not datatypes.numpy_available:
            return

        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)

        # Map CUTLASS layouts to the numpy memory-order flags used by np.zeros
        layout_to_order = {
            cutlass.LayoutType.RowMajor: 'C',
            cutlass.LayoutType.ColumnMajor: 'F'
        }
        size = (2, 2)
        A = np.zeros(size, order=layout_to_order[self.layout_A], dtype=type_A)
        B = np.zeros(size, order=layout_to_order[self.layout_B], dtype=type_B)
        C = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_C)
        D = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_D)

        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Gemm(B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A, layout_A=self.layout_A)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        # Only run this test if the layouts and types for A and B are equal.
        if type_A == type_B and self.layout_A == self.layout_B:
            plan_np = cutlass.op.Gemm(C=C, D=D, element_accumulator=type_accum, element=type_A, layout=self.layout_A)
            assert self._plans_equal(plan_np)

        # Test without explicit accumulator. Only run if the types of C and the accumulator match.
        if type_C == type_accum:
            plan_np = cutlass.op.Gemm(A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum and
            self.layout_A == self.layout_B and self.layout_A == self.layout_C):
            plan_np = cutlass.op.Gemm(element=type_A, layout=self.layout_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all tests on the Gemm interface
        """
        self.generic_test()
        self.numpy_test()
class GemmEquivalenceTest(unittest.TestCase):
    """
    Tests the equivalence of different constructions of the Gemm interface

    Each test instantiates a GemmEquivalence helper for a particular
    (element, layout, alignment) combination — encoded in the test name as
    types, a ttt/ntn-style layout triple, and the A/B/C alignments — and
    runs all of its equivalence checks.
    """

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_8_8_8(self):
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=8, alignment_B=8, alignment_C=8)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f32_ntn_8_8_8(self):
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
            layout_A=cutlass.LayoutType.ColumnMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.ColumnMajor,
            alignment_A=8, alignment_B=8, alignment_C=8)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_4_4_4(self):
        # Fix: this test previously passed alignment 8/8/8, making it a
        # duplicate of the _8_8_8 case above; use alignment 4 to match the
        # test name and actually exercise a non-default alignment.
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=4, alignment_B=4, alignment_C=4)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for F64 Tensor Core tests.")
    def test_gemm_equivalence_f64_f64_f64_f64_f64_tnt_1_1_1(self):
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f64, element_B=cutlass.DataType.f64, element_C=cutlass.DataType.f64,
            element_D=cutlass.DataType.f64, element_accumulator=cutlass.DataType.f64,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=1, alignment_B=1, alignment_C=1)
        gemm_eq.test_all()
class GemmErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Gemm interface
    """

    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Gemm(element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)

        # F16 supports a maximum alignment of 8 elements; 16 must be rejected
        with ExpectException(True, 'Alignment 16 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=16, alignment_B=16, alignment_C=16)

    def test_tensorop_availability(self):
        """
        Tests case in which only SIMT operations are available but TensorOp is requested
        """
        cc = device_cc()

        # F64 Tensor Core operations are only available on devices with CC >= 80
        supports_tensorop_f64 = cc >= 80
        plan = cutlass.op.Gemm(cc=cc, element=cutlass.DataType.f64, layout=cutlass.LayoutType.RowMajor)

        error_msg = f'Incorrectly raised an exception for availability of TensorOp with F64 operands on SM{cc}'
        with ExpectException(not supports_tensorop_f64, error_msg):
            plan.opclass = cutlass.OpcodeClass.TensorOp

        # If the assignment failed, the plan must have kept its SIMT opclass
        expected_opclass = cutlass.OpcodeClass.TensorOp if supports_tensorop_f64 else cutlass.OpcodeClass.Simt
        assert plan.opclass == expected_opclass, f'Expected opclass to be {expected_opclass}, but received {plan.opclass} for SM{cc}'

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for F16 Tensor Core tests.")
    def test_opclass_switch(self):
        """
        Tests cases in which the opcode class in question is switched (e.g., from TensorOp to SIMT)
        """
        plan = cutlass.op.Gemm(element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)
        assert plan.opclass == cutlass.OpcodeClass.TensorOp

        # Ensure that all tile descriptions have opclass of TensorOp
        for td in plan.tile_descriptions():
            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.TensorOp

        plan.opclass = cutlass.OpcodeClass.Simt

        # Ensure that all tile descriptions have opclass of Simt
        for td in plan.tile_descriptions():
            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.Simt

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        cc = device_cc()
        plan = cutlass.op.Gemm(cc=cc, element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)
        td = plan.tile_descriptions()[0]
        stages = td.stages

        # Zero stage count is valid for SM90+, as this is used to indicate that the builder's auto stage
        # count should be used
        with ExpectException(cc < 90, f'Requested zero stages'):
            td.stages = 0
            plan.construct(td)

        if cc < 90:
            with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
                td.stages = 3
                plan.construct(td)
        else:
            original_kschedule = td.kernel_schedule
            original_eschedule = td.epilogue_schedule
            # A pingpong kernel with a no-smem epilogue should leave room for
            # three mainloop stages on SM90
            with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
                td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
                td.epilogue_schedule = cutlass.EpilogueScheduleType.NoSmemWarpSpecialized
                td.stages = 3
                plan.construct(td)

            # Reset schedules
            td.kernel_schedule = original_kschedule
            td.epilogue_schedule = original_eschedule

        with ExpectException(True, f'Requested too many stages'):
            td.stages = 100
            plan.construct(td)

        # Reset stage count
        td.stages = stages

        cluster_shape = td.cluster_shape
        with ExpectException(cc < 90, f'Requested non-unit cluster shape on SM{cc}'):
            td.cluster_shape = [2, 1, 1]
            plan.construct(td)

        # Reset cluster shape
        td.cluster_shape = cluster_shape

        with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        # Kernel and epilogue schedules must be either both auto or both explicit
        with ExpectException(True, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.ScheduleAuto
            plan.construct(td)

        with ExpectException(True, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.ScheduleAuto
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
            td.tile_scheduler = cutlass.TileSchedulerType.StreamK
            plan.construct(td)

        # Ensure that all returned tile descriptions are unique
        ops = {}
        for td in plan.tile_descriptions():
            op = plan.construct(td)
            code_str = op.rt_module.emit()
            if code_str in ops:
                conflicting_td = ops[code_str]
                assert False, f'Multiple tile descriptions emitted {code_str}\nTile descriptions are:\n{td}\n{conflicting_td}'
            # Bug fix: the original never inserted into `ops`, so duplicate
            # emissions could never be detected. Record each emitted kernel.
            ops[code_str] = td
# Allow running this test module directly (e.g., `python <file>`).
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,69 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Helper functions & classes for interface test
"""
class ExpectException:
    """
    Utility class to assert that an exception was raised when expected

    Example:

    .. highlight:: python
    .. code-block:: python

        with ExpectException(True, 'Division by zero'):
            x = 1.0 / 0.0

    :param exception_expected: whether an exception is expected to be raised
    :type exception_expected: bool
    :param message: message to print if an exception is raised when not expected or vice versa;
                    when ``verify_msg`` is True, this must be the exact expected
                    exception text in the form ``"<ExcType>: <args>"``
    :type message: str
    :param verify_msg: whether to also check that the raised exception's type and
                       message exactly match ``message``
    :type verify_msg: bool
    """
    def __init__(self, exception_expected: bool, message: str = '', verify_msg: bool = False):
        self.exception_expected = exception_expected
        self.message = message
        self.verify_msg = verify_msg

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        exception_raised = exc_type is not None
        assert self.exception_expected == exception_raised, self.message

        # Bug fix: only inspect the exception when one was actually raised;
        # previously exc_type.__name__ crashed with AttributeError when
        # verify_msg=True and no exception occurred.
        if self.verify_msg and exception_raised:
            exc_message = f"{exc_type.__name__}: {exc_val}"
            assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"

        # Suppress the exception so the test can continue past the `with` block
        return True