CUTLASS 3.2 (#1024)

* CUTLASS 3.2
2023-08-07 14:50:32 -10:00
parent a0d787b746
commit 4575443d44
392 changed files with 47559 additions and 7940 deletions
--- a/test/python/conv2d/conv2d_sm80.py
+++ b/test/python/conv2d/conv2d_sm80.py
@ -0,0 +1,138 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Low-level functionality tests for Conv2d operands on SM80
+"""
+from conv2d_test_utils import *
+import cutlass
+import logging
+
+
+# Limit CUTLASS Python interface logging to warnings and above for this test module
+cutlass.set_log_level(logging.WARNING)
+# Compute capability targeted by these tests: compared against device_cc() in the
+# skip condition of the test class and passed to every add_test() call
+cc = 80
+
+@unittest.skipIf(device_cc() != cc, 'Device compute capability is invalid for SM80 tests.')
+class Conv2dSm80(unittest.TestCase):
+    """
+    Container test case for the SM80 Conv2d tests.
+
+    The class body is intentionally empty: the module-level add_test() calls in this
+    file attach one test method per kernel configuration when the module is imported
+    (not only when it is executed as __main__). The whole class is skipped unless
+    device_cc() equals ``cc``.
+    """
+    pass
+
+# Problem sizes shared by every test registered with `conv_problems` in this file
+conv_problems = get_conv_problems()
+
+# Tests registered once per convolution kind. Most leave the iterator algorithm
+# unset (presumably selecting the optimized iterator -- confirm against cutlass.Conv2d);
+# one explicitly requests the analytic iterator.
+for conv_kind in ["fprop", "wgrad", "dgrad"]:
+    # F16, simt
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
+        opclass="simt", threadblock_shape=[128, 128, 8], 
+        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
+    # F16, tensor op
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
+        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
+    # F16, tensor op, analytic iterator (F16 accumulator, unlike the other F16 tests)
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, 
+        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
+    # F16, tensor op, f32 output
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, 
+        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
+    # F16, tensor op, different tile description
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
+        opclass="tensor_op", threadblock_shape=[128, 64, 32], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
+    # F32, simt
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, 
+        opclass="simt", threadblock_shape=[128, 128, 8], 
+        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
+    # Tf32, tensorop (F32 tensors passed to a tensor-op kernel)
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
+        opclass="tensor_op", threadblock_shape=[128, 128, 16],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
+    )
+    # Split-K: serial mode with 2 slices, then parallel mode with 5 slices
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
+        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
+        split_k_slices=2)
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
+        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
+        split_k_slices=5)
+    # Swizzling functor (swizzle value 4)
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
+        opclass="tensor_op", threadblock_shape=[128, 64, 32], 
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)
+
+# Tests for few channels and fixed channels
+# F16, tensor op, few channels.
+# Each entry is (channels, threadblock shape, stages, instruction shape).
+for c, tb, stage, inst in [
+    (2, [128, 128, 64], 3, [16, 8, 16]),
+    (1, [128, 128, 32], 2, [16, 8, 8]),
+]:
+    add_test(
+        Conv2dSm80,
+        cc,
+        "fprop",
+        conv2d_few_channel_problemsizes(c),
+        element=cutlass.DataType.f16,
+        element_accumulator=cutlass.DataType.f32,
+        element_output=cutlass.DataType.f16,
+        opclass="tensor_op",
+        threadblock_shape=tb,
+        warp_count=[2, 2, 1],
+        instruction_shape=inst,
+        stages=stage,
+        iterator_algorithm="few_channels",
+    )
+# F16, tensor op, fixed channels: one fprop test per channel count
+for c in (8, 4, 2):
+    add_test(
+        Conv2dSm80,
+        cc,
+        "fprop",
+        conv2d_few_channel_problemsizes(c),
+        element=cutlass.DataType.f16,
+        element_accumulator=cutlass.DataType.f32,
+        element_output=cutlass.DataType.f16,
+        opclass="tensor_op",
+        threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1],
+        instruction_shape=[16, 8, 16],
+        stages=3,
+        iterator_algorithm="fixed_channels",
+    )
+
+# Test activations: each activation is combined with every (split-K mode, slice count) pair
+for activation in ("relu", "leaky_relu"):
+    for split_k_mode, split_k_slices in [("parallel", 1), ("serial", 7), ("parallel", 5)]:
+        add_test(
+            Conv2dSm80,
+            cc,
+            "fprop",
+            conv_problems,
+            element=cutlass.DataType.f16,
+            element_accumulator=cutlass.DataType.f32,
+            element_output=cutlass.DataType.f16,
+            opclass="tensor_op",
+            threadblock_shape=[128, 128, 64],
+            warp_count=[2, 2, 1],
+            instruction_shape=[16, 8, 16],
+            stages=3,
+            split_k_mode=split_k_mode,
+            split_k_slices=split_k_slices,
+            activation=activation,
+        )
+    
+
+# Run the dynamically registered tests when this file is executed directly.
+# `unittest` is not imported explicitly in this file; it is in scope through the
+# star import of conv2d_test_utils.
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/conv2d/conv2d_test_utils.py
+++ b/test/python/conv2d/conv2d_test_utils.py
@ -0,0 +1,508 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Util Functions for Conv2d Test
+"""
+import torch
+import cutlass
+import unittest
+import cutlass_bindings
+from cutlass.utils.datatypes import binding_type, binding_opclass
+from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
+from cutlass.backend.utils.device import device_cc
+from cutlass.backend.test.utils import get_name_conv2d
+import numpy as np
+
+def conv2d_few_channel_problemsizes(channels):
+    """
+    Build the Conv2d problem sizes used by the few-channels and fixed-channels tests.
+
+    Seven problems are returned. They differ only in the square extent of the first
+    coordinate, the leading value and square extent of the second coordinate, and the
+    value of the first MatrixCoord; everything else is constant across problems.
+    NOTE(review): the constructor arguments are presumably (input size, filter size,
+    padding, stride, dilation, mode, split-K slices, groups) -- confirm against the
+    cutlass_bindings Conv2dProblemSize signature.
+
+    :param channels: value used as the last component of both 4-D coordinates
+    :type channels: int
+
+    :return: list of cutlass_bindings.conv.Conv2dProblemSize
+    :rtype: list
+    """
+    coord4 = cutlass_bindings.Tensor4DCoord
+    coord2 = cutlass_bindings.MatrixCoord
+
+    # One row per problem:
+    # (extent of first coord, leading value of second coord, extent of second coord, first MatrixCoord value)
+    variants = [
+        (8, 16, 3, 2),
+        (16, 16, 3, 2),
+        (16, 16, 7, 1),
+        (224, 32, 7, 1),
+        (224, 64, 7, 2),
+        (224, 64, 5, 1),
+        (224, 64, 5, 2),
+    ]
+
+    return [
+        cutlass_bindings.conv.Conv2dProblemSize(
+            coord4(1, extent, extent, channels),
+            coord4(count, window, window, channels),
+            coord4(1, 1, 1, 1),
+            coord2(step, step),
+            coord2(1, 1),
+            cutlass_bindings.conv.Mode.cross_correlation,
+            1, 1
+        )
+        for extent, count, window, step in variants
+    ]
+
+# Maps cutlass data types to the torch dtype used when test tensors are created with torch
+torch_dtype = {
+    cutlass.DataType.f16: torch.float16,
+    cutlass.DataType.f32: torch.float32,
+    cutlass.DataType.f64: torch.float64
+}
+
+# Maps cutlass data types to the numpy dtype used when test tensors are created with numpy
+numpy_dtype = {
+    cutlass.DataType.f16: np.float16,
+    cutlass.DataType.f32: np.float32,
+    cutlass.DataType.f64: np.float64
+}
+
+
+def validate_problem_size(ps, conv_kind, split_k_slices):
+    """
+    Decide whether a problem size should be run for the given test configuration.
+
+    :param ps: problem size exposing H, W, R, S, P, Q and the pad/stride/dilation attributes
+    :param conv_kind: kind of convolution ("fprop", "dgrad", or "wgrad")
+    :type conv_kind: str
+    :param split_k_slices: number of split-K slices requested by the test
+    :type split_k_slices: int
+
+    :return: False if the output extent recomputed from the input, padding, dilation and
+        stride disagrees with ``ps.P``/``ps.Q``, or if the problem is a strided dgrad with
+        more than one split-K slice; True otherwise
+    :rtype: bool
+    """
+    expected_p = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
+    expected_q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
+    if (expected_p, expected_q) != (ps.P, ps.Q):
+        return False
+
+    # Split-K (serial or parallel) is not supported for strided dgrad
+    is_strided = ps.stride_h > 1 or ps.stride_w > 1
+    return not (conv_kind == "dgrad" and split_k_slices > 1 and is_strided)
+
+
+# Override the backend launcher
+class Conv2dLauncherFrontend(Conv2dLauncher):
+    """
+    Conv2d test launcher for plans created through the ``cutlass.Conv2d`` interface.
+
+    Overrides the backend ``Conv2dLauncher``: input tensors are created with numpy or
+    torch (selected by ``backend``), the plan is run on them, and the output is compared
+    against a reference computed on the host (numpy backend) or with torch.
+    """
+    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
+        """
+        :param plan: the Conv2d plan under test
+        :type plan: cutlass.Conv2d
+        :param seed: seed applied to the random number generator at the start of each run
+        :type seed: int
+        :param backend: "numpy" or "torch"; selects how tensors and the reference are produced
+        :type backend: str
+        """
+        # The parent constructor is not invoked; attributes are taken directly from the plan
+        self.operation = plan
+        self.conv_kind = plan.conv_kind
+        self.seed = seed
+        self.backend = backend
+
+        self.dtype_A = plan._element_a
+        self.dtype_B = plan._element_b
+        self.dtype_C = plan._element_c
+        self.dtype_acc = plan._element_accumulator
+
+        self.layout_A = cutlass_bindings.TensorNHWC
+        self.layout_B = cutlass_bindings.TensorNHWC
+        self.layout_C = cutlass_bindings.TensorNHWC
+        self.layout_D = cutlass_bindings.TensorNHWC
+
+        self.element_compute = cutlass_bindings.float32
+        self.enable_cached_results = True
+
+        # Get randomization_max: narrower value ranges are used for 16-bit inputs,
+        # and narrower still when the accumulator is 16-bit as well
+        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
+            if self.dtype_acc in [cutlass.DataType.f16, cutlass.DataType.bf16]:
+                self.randomization_max = 2
+            else:
+                self.randomization_max = 3
+        else:
+            self.randomization_max = 7
+
+        self.activation = plan.activation
+
+        self.host_conv2d = cutlass_bindings.test.conv.host.conv2d
+
+    def set_seed(self):
+        """Seed the random number generator of the active backend with ``self.seed``."""
+        if self.backend == "numpy":
+            np.random.seed(self.seed)
+        else:
+            torch.manual_seed(self.seed)
+
+    def uniform_init(self, size, dtype):
+        """
+        Create a randomly initialized tensor.
+
+        :param size: shape of the tensor
+        :param dtype: cutlass data type of the tensor (a key of ``numpy_dtype``/``torch_dtype``)
+
+        :return: for numpy, the result of the parent ``uniform_init``; for torch, a CUDA
+            tensor in channels-last memory format holding integer values
+        """
+        if self.backend == "numpy":
+            return super().uniform_init(size, numpy_dtype[dtype])
+        else:
+            # ceil() of a uniform sample from [-max - 0.5, max - 0.5) yields integer values
+            tensor = torch.ceil(
+                torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda").uniform_(-self.randomization_max - 0.5, self.randomization_max - 0.5)
+            ).to(memory_format=torch.channels_last)
+            return tensor
+
+    def zeros_like(self, tensor):
+        """Return a zero tensor shaped like ``tensor`` (channels-last for the torch backend)."""
+        if self.backend == "numpy":
+            return np.zeros_like(tensor)
+        else:
+            return torch.zeros_like(tensor).to(memory_format=torch.channels_last)
+
+    def reference(self, ps, A, B, C, alpha, beta, activation):
+        """
+        Compute the reference result ``activation(alpha * conv(A, B) + beta * C)``.
+
+        :param ps: the conv2d problem size
+        :param A: operand A of the convolution
+        :param B: operand B of the convolution
+        :param C: source tensor scaled by ``beta``
+        :param alpha: scalar applied to the convolution result
+        :param beta: scalar applied to ``C``
+        :param activation: activation functor applied to the result
+
+        :return: for the numpy backend, whatever ``host_reference`` returns; for the
+            torch backend, the reference tensor
+        :raises Exception: if the convolution kind is not fprop, dgrad, or wgrad (torch backend)
+        """
+        if self.backend == "numpy":
+            numpy_result = self.host_reference(ps, A, B, C, alpha, beta, activation)
+            return numpy_result
+        else:
+            stride = (ps.stride_h, ps.stride_w)
+            padding = (ps.pad_h, ps.pad_w)
+            # The kernel is launched with the problem's dilation, so every reference
+            # (including dgrad and wgrad) must use it as well
+            dilation = (ps.dilation_h, ps.dilation_w)
+            if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
+                torch_result = alpha * torch.ops.aten.conv2d(
+                    A,
+                    B,
+                    stride=stride,
+                    padding=padding,
+                    dilation=dilation
+                ) + beta * C
+            elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
+                torch_result = alpha * torch.nn.grad.conv2d_input(
+                    (ps.N, ps.C, ps.H, ps.W),
+                    B,
+                    A,
+                    padding=padding,
+                    stride=stride,
+                    dilation=dilation
+                ) + beta * C
+            elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
+                torch_result = alpha * torch.nn.grad.conv2d_weight(
+                    B,
+                    (ps.K, ps.C, ps.R, ps.S),
+                    A,
+                    padding=padding,
+                    stride=stride,
+                    dilation=dilation
+                ) + beta * C
+            else:
+                raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
+
+            if activation == cutlass.backend.epilogue.relu:
+                torch_result = torch.nn.functional.relu(torch_result)
+            elif activation == cutlass.backend.epilogue.leaky_relu:
+                # 0.5 matches the leaky_relu slope configured by add_test
+                torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
+
+            return torch_result
+
+    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
+        """
+        Compute (or load from the on-disk cache) the host reference for a problem.
+
+        When ``self.enable_cached_results`` is set, a key is built from the problem and
+        the operand tensors and looked up in the file
+        ``cached_results_SM<arch>_<seed>.txt``. On a miss the convolution is computed on
+        the host, the activation is applied, and the hash of the result is appended to
+        that file.
+
+        :param problem_size: the conv2d problem size
+        :param tensor_A: operand A
+        :param tensor_B: operand B
+        :param tensor_C: source tensor
+        :param alpha: scalar applied to the convolution result
+        :param beta: scalar applied to ``tensor_C``
+        :param activation: activation functor; must provide ``tag`` and ``numpy``
+
+        :return: the cached or freshly computed ``TensorHash`` of the reference when
+            caching is enabled, otherwise the reference tensor itself
+        """
+        # Cast the scalars to the compute type; any other compute type uses them as given
+        if self.element_compute == cutlass_bindings.float16:
+            alpha = cutlass_bindings.float16(alpha)
+            beta = cutlass_bindings.float16(beta)
+        elif self.element_compute == cutlass_bindings.int32:
+            alpha = int(alpha)
+            beta = int(beta)
+
+        # If cached result is loaded
+        cached_result_loaded = False
+
+        if self.enable_cached_results:
+            # Get problem key
+            cached_test_key = cutlass_bindings.test.conv.host.CreateCachedConv2dTestKey(
+                self.conv_kind,
+                problem_size,
+                alpha,
+                beta,
+                getTensorView(
+                    tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
+                ),
+                getTensorView(
+                    tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
+                ),
+                getTensorView(
+                    tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
+                ),
+            )
+
+            # Make the key activation-specific by appending the unqualified activation name
+            cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"
+
+            cached_test_result = cutlass_bindings.test.conv.host.CachedTestResult()
+
+            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
+                self.operation.arch,
+                self.seed,
+            )
+
+            cached_results = cutlass_bindings.test.conv.host.CachedTestResultListing(
+                conv2d_result_cache_name
+            )
+            # find() returns a pair: (whether the key was found, the cached result)
+            cached = cached_results.find(cached_test_key)
+            cached_result_loaded = cached[0]
+            if cached_result_loaded:
+                cached_test_result = cached[1]
+
+        if not cached_result_loaded:
+            # Compute the conv2d on host
+            tensor_D_ref = np.ones_like(tensor_C)
+            tensor_ref_A = getTensorRef(
+                tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
+            )
+            tensor_ref_B = getTensorRef(
+                tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
+            )
+            tensor_ref_C = getTensorRef(
+                tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
+            )
+            tensor_ref_D_ref = getTensorRef(
+                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
+            )
+
+            self.host_conv2d(
+                self.conv_kind,
+                problem_size,
+                tensor_ref_A,
+                tensor_ref_B,
+                tensor_ref_C,
+                tensor_ref_D_ref,
+                alpha,
+                beta,
+            )
+
+            if activation == cutlass.backend.epilogue.leaky_relu:
+                # 0.5 matches the leaky_relu slope configured by add_test
+                tensor_D_ref = activation.numpy(tensor_D_ref, 0.5)
+            else:
+                tensor_D_ref = activation.numpy(tensor_D_ref)
+
+            tensor_view_D_ref = getTensorView(
+                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
+            )
+
+            if self.enable_cached_results:
+                cached_test_result.D = cutlass_bindings.test.conv.host.TensorHash(
+                    tensor_view_D_ref
+                )
+                # The listing is loaded again from the file before the new entry is appended
+                cached_results = (
+                    cutlass_bindings.test.conv.host.CachedTestResultListing(
+                        conv2d_result_cache_name
+                    )
+                )
+                cached_results.append(cached_test_key, cached_test_result)
+                cached_results.write(conv2d_result_cache_name)
+            else:
+                return tensor_D_ref
+
+        return cached_test_result.D
+
+    def equal(self, tensor_D, tensor_D_ref, problem_size):
+        """
+        Compare the computed output with the reference.
+
+        :param tensor_D: output produced by the plan
+        :param tensor_D_ref: reference produced by ``reference``
+        :param problem_size: the conv2d problem size (used by the parent comparison)
+
+        :return: result of the parent ``equal`` for numpy, or of ``torch.equal`` for torch
+        """
+        if self.backend == "numpy":
+            return super().equal(tensor_D, tensor_D_ref, problem_size)
+        else:
+            # Wait for outstanding CUDA work before comparing the tensors
+            torch.cuda.synchronize()
+            return torch.equal(tensor_D, tensor_D_ref)
+
+    def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
+        """
+        Run the plan on randomly initialized tensors and verify the result.
+
+        :param ps: the conv2d problem size
+        :param split_k_mode: split-K mode forwarded to the plan
+        :param split_k_slices: number of split-K slices forwarded to the plan
+        :type split_k_slices: int
+        :param alpha: scalar applied to the convolution result
+        :param beta: scalar applied to the source tensor C
+
+        :return: result of ``equal`` on the computed and reference outputs
+        :raises Exception: if the convolution kind is not fprop, dgrad, or wgrad
+        """
+        #
+        # Initialize input and output tensors
+        #
+        # Shapes are given in NCHW/KCRS order for torch and NHWC/KRSC order otherwise
+        if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
+            if self.backend == "torch":
+                tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
+                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+                tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
+            else:
+                tensor_A_size = (ps.N, ps.H, ps.W, ps.C)
+                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
+                tensor_C_size = (ps.N, ps.P, ps.Q, ps.K)
+        elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
+            if self.backend == "torch":
+                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+                tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
+            else:
+                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
+                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
+                tensor_C_size = (ps.N, ps.H, ps.W, ps.C)
+        elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
+            if self.backend == "torch":
+                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+                tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
+                tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
+            else:
+                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
+                tensor_B_size = (ps.N, ps.H, ps.W, ps.C)
+                tensor_C_size = (ps.K, ps.R, ps.S, ps.C)
+        else:
+            raise Exception(f"Conv kind {self.conv_kind} is not supported")
+
+        # Re-seed on every run so the same seed always produces the same tensors
+        self.set_seed()
+
+        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
+        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
+        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
+        tensor_D = self.zeros_like(tensor_C)
+
+        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
+            stride=(ps.stride_h, ps.stride_w),
+            padding=(ps.pad_h, ps.pad_w),
+            dilation=(ps.dilation_h, ps.dilation_w),
+            alpha=alpha, beta=beta,
+            split_k=(split_k_mode, split_k_slices))
+
+        tensor_D_ref = self.reference(
+            ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
+        )
+
+        return self.equal(tensor_D, tensor_D_ref, ps)
+
+
+def add_test(
+    cls,
+    cc,
+    conv_kind,
+    problem_sizes,
+    element,
+    element_accumulator,
+    element_output,
+    opclass,
+    threadblock_shape,
+    warp_count,
+    instruction_shape,
+    stages,
+    iterator_algorithm=None,
+    swizzle=None,
+    split_k_mode="serial",
+    split_k_slices=1,
+    activation = "identity"
+):
+    """
+    Create a test-running function with the given specification and attach it to ``cls``.
+
+    The generated test builds a ``cutlass.Conv2d`` plan, configures it, and runs every
+    problem size accepted by ``validate_problem_size`` with alpha=1.0 and beta=0.5 using
+    the numpy backend, asserting that each run matches its reference.
+
+    :param cls: test case class that receives the generated test method
+    :param cc: compute capability, used only to build the test name
+    :param conv_kind: kind of convolution ("fprop", "dgrad", or "wgrad")
+    :param problem_sizes: iterable of problem sizes to run
+    :param element: data type of the operands
+    :param element_accumulator: data type of the accumulator
+    :param element_output: data type used for both C and D
+    :param opclass: operation class assigned to the plan
+    :param threadblock_shape: threadblock shape of the tile description
+    :param warp_count: warp count of the tile description
+    :param instruction_shape: instruction shape of the tile description
+    :param stages: number of stages of the tile description
+    :param iterator_algorithm: iterator algorithm; left at the plan's default when None
+    :param swizzle: swizzling stride; left at the plan's default when None
+    :param split_k_mode: split-K mode passed to each run
+    :param split_k_slices: number of split-K slices passed to each run
+    :param activation: "identity", "leaky_relu", or the name of an attribute of ``cutlass.epilogue``
+
+    :return: the generated test function
+    """
+    test_name = get_name_conv2d(
+        cc, conv_kind, element, element_accumulator, element_output,
+        opclass, threadblock_shape, warp_count, instruction_shape, stages,
+        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation
+    )
+
+    def run(self):
+        # Build the plan and configure it: opclass first, then the tile description
+        plan = cutlass.Conv2d(
+            kind=conv_kind,
+            element=element,
+            element_accumulator=element_accumulator,
+            element_C=element_output,
+            element_D=element_output,
+        )
+        plan.opclass = opclass
+        plan.tile_description = {
+            "threadblock_shape": threadblock_shape,
+            "warp_count": warp_count,
+            "stages": stages,
+            "instruction_shape": instruction_shape,
+        }
+
+        # Optional settings are applied only when the caller requested them
+        if iterator_algorithm is not None:
+            plan.iterator_algorithm = iterator_algorithm
+        if swizzle is not None:
+            plan.swizzling_stride = swizzle
+
+        # leaky_relu takes an extra argument; "identity" keeps the plan's default activation
+        if activation == "leaky_relu":
+            plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
+        elif activation != "identity":
+            plan.activation = getattr(cutlass.epilogue, activation)
+
+        launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")
+
+        for ps in problem_sizes:
+            if validate_problem_size(ps, conv_kind, split_k_slices):
+                self.assertTrue(
+                    launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5)
+                )
+
+    setattr(cls, test_name, run)
+
+    return run
+
+
+def get_conv_problems():
+    """
+    Return the Conv2d problem sizes shared by the tests.
+
+    The list starts with the default sizes of ``TestbedConv2dProblemSizes(64)`` and is
+    followed by three extra problems whose channel counts are 12, 14 and 98.
+
+    :return: list of cutlass_bindings.conv.Conv2dProblemSize
+    :rtype: list
+    """
+    coord4 = cutlass_bindings.Tensor4DCoord
+    coord2 = cutlass_bindings.MatrixCoord
+    mode = cutlass_bindings.conv.Mode.cross_correlation
+
+    # 64: minimum channel size
+    default_problems = list(
+        cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes
+    )
+
+    # Alignment 4 & 2 tests, appended after the defaults
+    alignment_problems = [
+        cutlass_bindings.conv.Conv2dProblemSize(
+            coord4(1, 4, 4, 12),
+            coord4(8, 3, 3, 12),
+            coord4(0, 0, 0, 0),
+            coord2(3, 3),
+            coord2(1, 1),
+            mode,
+            1, 1
+        ),
+        cutlass_bindings.conv.Conv2dProblemSize(
+            coord4(1, 4, 4, 14),
+            coord4(8, 3, 3, 14),
+            coord4(0, 0, 0, 0),
+            coord2(3, 3),
+            coord2(1, 1),
+            mode,
+            1, 1
+        ),
+        cutlass_bindings.conv.Conv2dProblemSize(
+            coord4(1, 23, 56, 98),
+            coord4(128, 3, 3, 98),
+            coord4(4, 0, 5, 0),
+            coord2(3, 3),
+            coord2(1, 1),
+            mode,
+            1, 1
+        ),
+    ]
+
+    return default_problems + alignment_problems
--- a/test/python/conv2d/run_all_tests.py
+++ b/test/python/conv2d/run_all_tests.py
@ -0,0 +1,42 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import unittest
+
+
+if __name__ == '__main__':
+    # Discover every conv2d_*.py test module under the current directory and run them all
+    suite = unittest.TestLoader().discover('./', 'conv2d_*.py')
+    outcome = unittest.runner.TextTestRunner().run(suite)
+    # Raise so that a failing run terminates the script with an error
+    if not outcome.wasSuccessful():
+        raise Exception('Test cases failed')
--- a/test/python/emit/pytorch.py
+++ b/test/python/emit/pytorch.py
@ -39,6 +39,7 @@ import tempfile
 import unittest

 import cutlass
+import cutlass_bindings

 if cutlass.utils.datatypes.torch_available:
    import torch
@ -85,6 +86,34 @@ def _generate_problems(dtype, num):
        Ds.append(D)
    return As, Bs, Cs, Ds

+def _generate_conv2d_problem(conv_kind, dtype, ps):
+    """
+    Utility function to generate conv2d inputs
+
+    :param conv_kind: kind of convolution ("fprop", "dgrad", or "wgrad")
+    :type conv_kind: str
+    :param dtype: data type of tensors
+    :param ps: the conv2d problem size
+    :type ps: cutlass_bindings.conv.Conv2dProblemSize
+
+    :return: initialized tensors A, B, and C
+    :rtype: list
+    :raises ValueError: if ``conv_kind`` is not "fprop", "dgrad", or "wgrad"
+    """
+    if conv_kind == "fprop":
+        tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
+        tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+        tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
+    elif conv_kind == "dgrad":
+        tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+        tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+        tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
+    elif conv_kind == "wgrad":
+        tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+        tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
+        tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
+    else:
+        # Reject unknown kinds instead of silently generating wgrad-shaped tensors
+        raise ValueError(f"Unsupported conv kind: {conv_kind!r}")
+    sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
+    # ceil() of a uniform sample yields integer values; tensors are created on the GPU
+    # and converted to channels-last memory format
+    return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]
+

@unittest.skipIf(not cutlass.utils.datatypes.torch_available, 'PyTorch must be available to run PyTorch extension tests')
 class PyTorchExtensionTest(unittest.TestCase):
@ -155,6 +184,127 @@ class PyTorchExtensionTest(unittest.TestCase):
        Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
        Ds = mod.run(As, Bs, Cs, alpha, beta)
        check_all(Ds, Ds_ref)
+    
+    def test_conv2d_fprop(self):
+        """
+        JIT-compile a PyTorch extension for an F16 fprop Conv2d with ReLU activation and compare
+        its output with a PyTorch reference. Serial and parallel split-K runs are also checked
+        against the non-split-K output.
+        """
+        torch.manual_seed(2023)
+
+        dtype = torch.float16
+        plan = cutlass.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
+        plan.activation = "relu"
+
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
+
+        problem_size = cutlass_bindings.conv.Conv2dProblemSize(
+            cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
+            cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
+            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
+            cutlass_bindings.MatrixCoord(3, 3),
+            cutlass_bindings.MatrixCoord(1, 1),
+            cutlass_bindings.conv.Mode.cross_correlation,
+            1, 1
+        )
+
+        A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
+        stride = (problem_size.stride_h, problem_size.stride_w)
+        padding = (problem_size.pad_h, problem_size.pad_w)
+
+        alpha = 1.0
+        beta = 0.5
+
+        # Reference: relu(alpha * conv2d(A, B) + beta * C)
+        D_ref = alpha * torch.ops.aten.conv2d(
+            A, B, stride=stride, padding=padding
+        ) + beta * C
+        D_ref = torch.nn.functional.relu(D_ref)
+        D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)
+
+        # unittest assertions are used instead of bare `assert` so the checks still run under `python -O`
+        self.assertTrue(torch.allclose(D, D_ref))
+
+        # Test serial split-K
+        D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
+        self.assertTrue(torch.allclose(D, D_serial_split_k))
+
+        # Test parallel split-K
+        D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
+        self.assertTrue(torch.allclose(D, D_parallel_split_k))
+        
+    
+    def test_conv2d_dgrad(self):
+        """
+        JIT-compile a PyTorch extension for an F16 dgrad Conv2d and compare its output with
+        ``torch.nn.grad.conv2d_input`` scaled by alpha plus beta * C.
+        """
+        torch.manual_seed(2023)
+        dtype = torch.float16
+        plan = cutlass.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)
+
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
+
+        problem_size = cutlass_bindings.conv.Conv2dProblemSize(
+            cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
+            cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
+            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
+            cutlass_bindings.MatrixCoord(3, 3),
+            cutlass_bindings.MatrixCoord(1, 1),
+            cutlass_bindings.conv.Mode.cross_correlation,
+            1, 1
+        )
+
+        A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
+        stride = (problem_size.stride_h, problem_size.stride_w)
+        padding = (problem_size.pad_h, problem_size.pad_w)
+
+        alpha = 1.0
+        beta = 0.5
+        input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
+        # Reference: alpha * conv2d_input(input_size, weight=B, grad_output=A) + beta * C
+        D_ref = alpha * torch.nn.grad.conv2d_input(
+            input_size, B, A,
+            stride=stride, padding=padding
+        ) + beta * C
+        D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta, )
+
+        # unittest assertion is used instead of a bare `assert` so the check still runs under `python -O`
+        self.assertTrue(torch.allclose(D, D_ref))
+    
+    def test_conv2d_wgrad(self):
+        """
+        JIT-compile a PyTorch extension for an F16 wgrad Conv2d and compare its output with
+        ``torch.nn.grad.conv2d_weight`` scaled by alpha plus beta * C. Serial and parallel
+        split-K runs are also checked against the non-split-K output.
+        """
+        torch.manual_seed(2023)
+        dtype = torch.float16
+        plan = cutlass.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)
+
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
+
+        problem_size = cutlass_bindings.conv.Conv2dProblemSize(
+            cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
+            cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
+            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
+            cutlass_bindings.MatrixCoord(3, 3),
+            cutlass_bindings.MatrixCoord(1, 1),
+            cutlass_bindings.conv.Mode.cross_correlation,
+            1, 1
+        )
+
+        A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
+        stride = (problem_size.stride_h, problem_size.stride_w)
+        padding = (problem_size.pad_h, problem_size.pad_w)
+
+        alpha = 1.0
+        beta = 0.5
+        weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
+        # Reference: alpha * conv2d_weight(input=B, weight_size, grad_output=A) + beta * C
+        D_ref = alpha * torch.nn.grad.conv2d_weight(
+            B, weight_size, A,
+            stride=stride, padding=padding
+        ) + beta * C
+        D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)
+
+        # unittest assertions are used instead of bare `assert` so the checks still run under `python -O`
+        self.assertTrue(torch.allclose(D, D_ref))
+
+        # Test serial split-K
+        D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
+        self.assertTrue(torch.allclose(D, D_serial_split_k))
+
+        # Test parallel split-K
+        D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
+        self.assertTrue(torch.allclose(D, D_parallel_split_k))


 if __name__ == '__main__':
--- a/test/python/gemm/gemm_f16_sm80.py
+++ b/test/python/gemm/gemm_f16_sm80.py
@ -37,82 +37,16 @@ Low-level functionality tests for GEMM with F16 operands on SM80
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 80

-# Partial specialziation for naming tests
-bound_type = binding_type(cutlass.DataType.f16)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             threadblock_shape, warp_count, stages, opclass, swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alingments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param swizzle: threadblock swizzling functor
-    """
-    cluster_shape = [1, 1, 1]
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.f16
-        element_B = cutlass.DataType.f16
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator,
-                               kernel_cc=cc)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.warp_count = warp_count
-        td.cluster_shape = cluster_shape
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
-    setattr(cls, name, run)
-
-    return run
-
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
 class GemmF16Sm80(unittest.TestCase):
    """
@ -128,40 +62,64 @@ class GemmF16Sm80StreamK(unittest.TestCase):
    """
    pass

+add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f16, cc=cc, cluster_shape=[1, 1, 1])

 # Tests using TensorOp
-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
+add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

-add_test_tensorop(GemmF16Sm80, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.NNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.NTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.NTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 128, 32], [1, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 64, 32], [2, 1, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 5)
-add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [2, 2, 2], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128,  64, 32], warp_count=[2, 1, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 64], warp_count=[1, 1, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 64], warp_count=[1, 1, 1], stages=5)
+add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
+                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)

 # Tests using SIMT
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
+add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

-add_test_simt(GemmF16Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
-add_test_simt(GemmF16Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [64, 128, 8], [1, 2, 1], 2)
-add_test_simt(GemmF16Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [128, 64, 8], [2, 1, 1], 2)
-add_test_simt(GemmF16Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 8], [1, 1, 1], 2)
-add_test_simt(GemmF16Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 8], [2, 2, 1], 2)
+add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
+add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
+add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128,  64, 8], warp_count=[2, 1, 1], stages=2)
+add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 8], warp_count=[1, 1, 1], stages=2)
+add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
+              element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

 # Stream K tests
-add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
-add_test_streamk(GemmF16Sm80StreamK, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_streamk(GemmF16Sm80StreamK, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 5)
+add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
+add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 64], warp_count=[1, 1, 1], stages=5)

 if __name__ == '__main__':
    unittest.main()
--- a/test/python/gemm/gemm_f16_sm90.py
+++ b/test/python/gemm/gemm_f16_sm90.py
@ -37,87 +37,16 @@ Low-level functionality tests for GEMM with F16 operands on SM90
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 90

-# Partial specialziation for naming tests
-bound_type = binding_type(cutlass.DataType.f16)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             cluster_shape, threadblock_shape, stages, opclass,
-             kernel_schedule=cutlass.KernelScheduleType.ScheduleAuto,
-             swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alingments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param cluster_shape: dimensions of threadblock cluster
-    :type cluster_shape: list or tuple
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param kernel_schedule: kernel schedule type
-    :type kernel_schedule: cutlass.KernelScheduleType
-    :param swizzle: threadblock swizzling functor
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.f16
-        element_B = cutlass.DataType.f16
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.cluster_shape = cluster_shape
-        td.kernel_schedule = kernel_schedule
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
-                   opclass=binding_opclass(opclass), kernel_schedule=kernel_schedule)
-    setattr(cls, name, run)
-
-    return run
-
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
 class GemmF16Sm90(unittest.TestCase):
    """
@ -126,47 +55,85 @@ class GemmF16Sm90(unittest.TestCase):
    pass


-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
+add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
+                               warp_count=None, compilation_modes=['nvcc'])
+
+add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

 # Tests with 1x1x1 clusters
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], 3)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 64, 64], 5)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [2, 2, 2], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
+add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
+add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
+add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
+add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 64], stages=5)
+add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
+                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)

 # Tests with different cluster shapes
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [1, 4, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 4, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [4, 1, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [4, 2, 1], [64, 128, 64], None)
+add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
+add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
+                       element_accumulator=cutlass.DataType.f16, cluster_shape=[2, 2, 1])
+add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
+add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
+add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
+add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[1, 4, 1])
+add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 4, 1])
+add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 1, 1])
+add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
+                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 2, 1])

 # Tests for different schedule modes
-add_test_schedule = partial(add_test, GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, opclass=cutlass.OpcodeClass.TensorOp)
-add_test_schedule([1, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
-add_test_schedule([1, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
-add_test_schedule([2, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
-add_test_schedule([2, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
-add_test_schedule([2, 1, 1], [256, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
-add_test_schedule([2, 1, 1], [128, 128, 64], 5, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
-add_test_schedule([2, 1, 1], [128, 128, 64], 5, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
+add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
+                            element_output=cutlass.DataType.f32, element_accumulator=cutlass.DataType.f32,
+                            opclass=cutlass.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
+add_test_schedule(
+    cluster_shape=[1, 1, 1],
+    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
+    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
+)
+add_test_schedule(
+    cluster_shape=[1, 1, 1],
+    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
+    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
+)
+add_test_schedule(
+    cluster_shape=[2, 1, 1],
+    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
+    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
+)
+add_test_schedule(
+    cluster_shape=[2, 1, 1],
+    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
+    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
+)

 # Tests using SIMT
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
-add_test_simt(GemmF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 128, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 64, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 64, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 8], 2)
+add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
+add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8])
+add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8])
+add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128,  64, 8])
+add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 8])
+add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8])


 if __name__ == '__main__':
--- a/test/python/gemm/gemm_f32_sm80.py
+++ b/test/python/gemm/gemm_f32_sm80.py
@ -37,83 +37,16 @@ Low-level functionality tests for GEMM with F32 operands on SM80
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 80

-# Partial specialization for naming tests
-bound_type = binding_type(cutlass.DataType.f32)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             threadblock_shape, warp_count, stages, opclass, swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alignments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param swizzle: threadblock swizzling functor
-    """
-
-    cluster_shape = [1, 1, 1]
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.f32
-        element_B = cutlass.DataType.f32
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator,
-                               kernel_cc=cc)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.warp_count = warp_count
-        td.cluster_shape = cluster_shape
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
-    setattr(cls, name, run)
-
-    return run
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
 class GemmF32Sm80(unittest.TestCase):
    """
@ -130,25 +63,37 @@ class GemmF32Sm80StreamK(unittest.TestCase):
    pass


+add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f32, cc=cc, cluster_shape=[1, 1, 1])
+
 # Tests using TensorOp
-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
+add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

-add_test_tensorop(GemmF32Sm80, LayoutCombination.NNN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF32Sm80, LayoutCombination.NNT, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
-add_test_tensorop(GemmF32Sm80, LayoutCombination.NTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [64, 128, 32], [1, 2, 1], 3)
-add_test_tensorop(GemmF32Sm80, LayoutCombination.NTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [64, 64, 32], [1, 1, 1], 4)
+add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
+                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 32], warp_count=[1, 1, 1], stages=4)
 # Tests using SIMT
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
+add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

-add_test_simt(GemmF32Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
-add_test_simt(GemmF32Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [64, 128, 8], [1, 2, 1], 2)
-add_test_simt(GemmF32Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 64, 8], [2, 1, 1], 2)
-add_test_simt(GemmF32Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [64, 64, 8], [1, 1, 1], 2)
-add_test_simt(GemmF32Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
+add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
+add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
+add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128,  64, 8], warp_count=[2, 1, 1], stages=2)
+add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64,  64, 8], warp_count=[1, 1, 1], stages=2)
+add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
+              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

 # Stream K tests
-add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
-add_test_streamk(GemmF32Sm80StreamK, LayoutCombination.TTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
+add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
+add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
+                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)


 if __name__ == '__main__':
--- a/test/python/gemm/gemm_f64_sm80.py
+++ b/test/python/gemm/gemm_f64_sm80.py
@ -37,84 +37,16 @@ Low-level functionality tests for GEMM with F64 operands on SM80
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 80

-# Partial specialization for naming tests
-bound_type = binding_type(cutlass.DataType.f64)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             threadblock_shape, warp_count, stages, opclass, swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alignments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param swizzle: threadblock swizzling functor
-    """
-
-    cluster_shape = [1, 1, 1]
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.f64
-        element_B = cutlass.DataType.f64
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator,
-                               kernel_cc=cc)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.warp_count = warp_count
-        td.cluster_shape = cluster_shape
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
-    setattr(cls, name, run)
-
-    return run
-
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
 class GemmF64Sm80(unittest.TestCase):
    """
@ -131,25 +63,36 @@ class GemmF64Sm80StreamK(unittest.TestCase):
    pass


-# Tests using TensorOp
-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
+add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f64, cc=cc, cluster_shape=[1, 1, 1])

-add_test_tensorop(GemmF64Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 16], [4, 2, 1], 3)
-add_test_tensorop(GemmF64Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 64, 16], [2, 2, 1], 4)
-add_test_tensorop(GemmF64Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [32, 32, 16], [2, 1, 1], 5)
+# Tests using TensorOp
+add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
+
+add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
+add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64,  64, 16], warp_count=[2, 2, 1], stages=4)
+add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 32,  32, 16], warp_count=[2, 1, 1], stages=5)

 # Tests using SIMT
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
+add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

-add_test_simt(GemmF64Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 8], [2, 2, 1], 2)
-add_test_simt(GemmF64Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 128, 8], [1, 2, 1], 2)
-add_test_simt(GemmF64Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 64, 8], [2, 1, 1], 2)
-add_test_simt(GemmF64Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 64, 8], [1, 1, 1], 2)
-add_test_simt(GemmF64Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 8], [2, 2, 1], 2)
+add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
+add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
+add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128,  64, 8], warp_count=[2, 1, 1], stages=2)
+add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64,  64, 8], warp_count=[1, 1, 1], stages=2)
+add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

 # Stream K tests
-add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
-add_test_streamk(GemmF64Sm80StreamK, LayoutCombination.NTT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 16], [4, 2, 1], 3)
+add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
+add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
+                 element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)


 if __name__ == '__main__':
--- a/test/python/gemm/gemm_f64_sm90.py
+++ b/test/python/gemm/gemm_f64_sm90.py
@ -37,90 +37,16 @@ Low-level functionality tests for GEMM with F64 operands on SM90
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 90

-# Partial specialization for naming tests
-bound_type = binding_type(cutlass.DataType.f64)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             cluster_shape, threadblock_shape, stages, opclass, persistent=False, swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alignments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param cluster_shape: dimensions of threadblock cluster
-    :type cluster_shape: list or tuple
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param persistent: whether this is a persistent warp-specialized kernel
-    :type persistent: bool
-    :param swizzle: threadblock swizzling functor
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.f64
-        element_B = cutlass.DataType.f64
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.cluster_shape = cluster_shape
-        td.persistent = persistent
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    if persistent:
-        suffix = "_persistent"
-    else:
-        suffix = ""
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
-                   opclass=binding_opclass(opclass), suffix=suffix)
-    setattr(cls, name, run)
-
-    return run
-
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
 class GemmF64Sm90(unittest.TestCase):
    """
@ -129,13 +55,14 @@ class GemmF64Sm90(unittest.TestCase):
    pass


-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
+add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
+                               element=cutlass.DataType.f64, element_output=cutlass.DataType.f64,
+                               element_accumulator=cutlass.DataType.f64, compilation_modes=['nvcc'])

-add_test_tensorop(GemmF64Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 32], 3)
-add_test_tensorop(GemmF64Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 32], 3)
-add_test_simt(GemmF64Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 8], 2)
-add_test_simt(GemmF64Sm90, LayoutCombination.TTT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [64, 128, 8], 2)
+add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.NNT, threadblock_shape=[128, 128, 32], stages=3)
+add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.TNN, threadblock_shape=[128, 128, 32], stages=3)
+add_test_specialized(    opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.NNN, threadblock_shape=[128, 128,  8], stages=2)
+add_test_specialized(    opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.TTT, threadblock_shape=[ 64, 128,  8], stages=2)


 if __name__ == '__main__':
--- a/test/python/gemm/gemm_s8_sm80.py
+++ b/test/python/gemm/gemm_s8_sm80.py
@ -37,84 +37,16 @@ Low-level functionality tests for GEMM with S8 operands on SM80
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 80

-# Partial specialization for naming tests
-bound_type = binding_type(cutlass.DataType.s8)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             threadblock_shape, warp_count, stages, opclass, swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alignments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param swizzle: threadblock swizzling functor
-    """
-
-    cluster_shape = [1, 1, 1]
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.s8
-        element_B = cutlass.DataType.s8
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator,
-                               kernel_cc=cc)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.warp_count = warp_count
-        td.cluster_shape = cluster_shape
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
-    setattr(cls, name, run)
-
-    return run
-
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
 class GemmS8Sm80(unittest.TestCase):
    """
@ -131,25 +63,36 @@ class GemmS8Sm80StreamK(unittest.TestCase):
    pass


-# Tests using TensorOp
-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
+add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.s8, cc=cc, cluster_shape=[1, 1, 1])

-add_test_tensorop(GemmS8Sm80, LayoutCombination.TNN, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [256, 128, 64], [4, 2, 1], 3)
-add_test_tensorop(GemmS8Sm80, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [128, 256, 64], [2, 4, 1], 3)
-add_test_tensorop(GemmS8Sm80, LayoutCombination.TNN, [16, 16, 4], cutlass.DataType.s32, cutlass.DataType.s32, [64, 64, 64], [1, 1, 1], 4)
+# Tests using TensorOp
+add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
+
+add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16],  element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
+add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16],  element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
+add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16,  4], element_output=cutlass.DataType.s32,
+                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64,  64, 64], warp_count=[1, 1, 1], stages=4)

 # Tests using SIMT
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
+add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

-add_test_simt(GemmS8Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [128, 128, 8], [2, 2, 1], 2)
-add_test_simt(GemmS8Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [64, 128, 8], [1, 2, 1], 2)
-add_test_simt(GemmS8Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [128, 64, 8], [2, 1, 1], 2)
-add_test_simt(GemmS8Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.s32, cutlass.DataType.s32, [64, 64, 8], [1, 1, 1], 2)
-add_test_simt(GemmS8Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.s32, cutlass.DataType.s32, [128, 128, 8], [2, 2, 1], 2)
+add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1],  element_output=cutlass.DataType.s8,
+              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
+add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1],  element_output=cutlass.DataType.s8,
+              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
+add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1],  element_output=cutlass.DataType.s8,
+              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128,  64, 8], warp_count=[2, 1, 1], stages=2)
+add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
+              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64,  64, 8], warp_count=[1, 1, 1], stages=2)
+add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
+              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

 # Stream K tests
-add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
-add_test_streamk(GemmS8Sm80StreamK, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [128, 256, 64], [2, 4, 1], 3)
+add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
+add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                 element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)


 if __name__ == '__main__':
--- a/test/python/gemm/gemm_s8_sm90.py
+++ b/test/python/gemm/gemm_s8_sm90.py
@ -37,90 +37,16 @@ Low-level functionality tests for GEMM with S8 operands on SM90
 from functools import partial

 import cutlass
-from cutlass.utils.datatypes import binding_opclass, binding_type
-from cutlass.backend.test.gemm_testbed import test_all_gemm
+import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, get_name
+from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
 from cutlass.backend.utils.device import device_cc

+
+cutlass.set_log_level(logging.WARNING)
 cc = 90

-# Partial specialziation for naming tests
-bound_type = binding_type(cutlass.DataType.s8)
-name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator,
-             cluster_shape, threadblock_shape, stages, opclass, persistent=False, swizzle=None):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: layouts of A, B, and C operands
-    :type layouts: list or tuple
-    :param alignments: alingments of A, B, and C operands
-    :type alignments: list or tuple
-    :param element_output: data type of the output element
-    :type element_output: cutlass.DataType
-    :param element_accumulator: data type used in accumulation
-    :type element_accumulator: cutlass.DataType
-    :param cluster_shape: dimensions of threadblock cluster
-    :type cluster_shape: list or tuple
-    :param threadblock_shape: dimensions of threadblock tiles
-    :type threadblock_shape: list or tuple
-    :param warp_count: warps to be launched per threadblock dimension
-    :type warp_count: list or tuple
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass.OpClass
-    :param persistent: whether this is a persistent warp-specialized kernel
-    :type persistent: bool
-    :param swizzle: threadblock swizzling functor
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass.DataType.s8
-        element_B = cutlass.DataType.s8
-        layout_A, layout_B, layout_C = layouts
-        alignment_A, alignment_B, alignment_C = alignments
-
-        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
-                               element_C=element_output, element_D=element_output,
-                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
-                               element_accumulator=element_accumulator)
-
-        plan.opclass = opclass
-        if swizzle is not None:
-            plan.swizzling_functor = swizzle
-        td = plan.tile_descriptions()[0]
-        td.threadblock_shape = threadblock_shape
-        td.stages = stages
-        td.cluster_shape = cluster_shape
-        td.persistent = persistent
-        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
-        self.assertTrue(test_all_gemm(op, 'universal'))
-
-    if persistent:
-        suffix = "_persistent"
-    else:
-        suffix = ""
-
-    element_epilogue = element_accumulator
-    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
-                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
-                   opclass=binding_opclass(opclass), suffix=suffix)
-    setattr(cls, name, run)
-
-    return run
-
-
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
 class GemmS8Sm90(unittest.TestCase):
    """
@ -129,26 +55,40 @@ class GemmS8Sm90(unittest.TestCase):
    pass


-add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
+add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=cutlass.DataType.s8, compilation_modes=['nvcc'])
+
+add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

 # Tests with 1x1x1 clusters
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNN, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], 3)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 8],  cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [64, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 64, 32], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [4, 4, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
+add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16,  8], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64,  128, 128], stages=None)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128,  64,  32], stages=None)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4,  4, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)

 # Tests with different cluster shapes
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [2, 2, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 4, 1], [128, 128, 128], None)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)

-# Tests with persistent warp-specialized threadblocks
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [2, 1, 1], [128, 128, 128], None, persistent=True)
+# Tests with warp-specialized ping-pong schedule
+add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
+                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
+                  kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
+                  epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized)

 # Tests for SIMT
-add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
-add_test_simt(GemmS8Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [64, 32, 8], 2)
+add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
+add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
+              element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)


 if __name__ == '__main__':
--- a/test/python/interface/conv2d_interface.py
+++ b/test/python/interface/conv2d_interface.py
@ -0,0 +1,285 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Tests the high-level Conv2d interface
+"""
+
+from math import ceil
+import unittest
+
+import cutlass
+import cutlass_bindings
+import cutlass.utils.datatypes as datatypes
+from cutlass.backend.utils.device import device_cc
+from utils import ExpectException
+import os
+
+
+class Conv2dEquivalence:
+    """
+    Helper class for testing the equivalence of different constructions of the Conv2d interface
+
+    A reference plan and operation are constructed from the fully-specified arguments. Each test
+    builds another plan from a different combination of arguments and checks that the operation
+    constructed from it produces the same ``rt_module.emit()`` output as the reference operation.
+
+    :param conv_kind: kind of convolution, forwarded as ``kind`` to ``cutlass.op.Conv2d``
+    :param element_A: data type of operand A
+    :param element_B: data type of operand B
+    :param element_C: data type of operand C
+    :param element_D: data type of operand D
+    :param element_accumulator: data type used in accumulation
+    :param alignment_A: alignment of operand A passed to ``construct``
+    :param alignment_B: alignment of operand B passed to ``construct``
+    :param alignment_C: alignment of operand C passed to ``construct``
+    """
+    def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
+                 alignment_A, alignment_B, alignment_C):
+
+        self.element_A = element_A
+        self.element_B = element_B
+        self.element_C = element_C
+        self.element_D = element_D
+        self.element_accumulator = element_accumulator
+        self.alignment_A = alignment_A
+        self.alignment_B = alignment_B
+        self.alignment_C = alignment_C
+
+        self.conv_kind = conv_kind
+
+        # Reference plan and operation against which all other constructions are compared
+        self.plan = cutlass.op.Conv2d(
+            kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
+            element_D=element_D, element_accumulator=element_accumulator)
+
+        self.op = self.plan.construct(
+            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
+            alignment_C=self.alignment_C)
+
+    def _plans_equal(self, other_plan) -> bool:
+        """
+        Compares whether two plans are equal
+
+        The operation constructed from `other_plan` (using the alignments stored on this object)
+        is compared with the reference operation via the output of ``rt_module.emit()``.
+
+        :param other_plan: plan to compare against the default Conv2d
+        :type other_plan: cutlass.op.Conv2d
+
+        :return: whether `other_plan` is equivalent to `self.plan`
+        :rtype: bool
+        """
+        other_op = other_plan.construct(
+            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
+            alignment_C=self.alignment_C)
+
+        return self.op.rt_module.emit() == other_op.rt_module.emit()
+
+    def generic_test(self):
+        """
+        Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
+        and layouts for constructing the Conv2d interface
+        """
+        # NOTE(review): this method does not reference numpy itself; the guard matches the one in
+        # `numpy_test` -- confirm whether it is required here.
+        if not datatypes.numpy_available:
+            return
+
+        # Test when specifying all parameters
+        plan_other = cutlass.op.Conv2d(
+            kind=self.conv_kind,
+            element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
+            element_D=self.element_D, element_accumulator=self.element_accumulator)
+        assert self._plans_equal(plan_other)
+
+        # Test when specifying all parameters but A
+        plan_other = cutlass.op.Conv2d(
+            kind=self.conv_kind,
+            element_B=self.element_B, element_C=self.element_C,
+            element_D=self.element_D, element_accumulator=self.element_accumulator,
+            element=self.element_A)
+        assert self._plans_equal(plan_other)
+
+        # The generic `element` stands in for both A and B in the next two cases, so they are only
+        # meaningful when A and B share a type (the same guard is applied in `tensor_test`).
+        if self.element_A == self.element_B:
+            # Test when specifying all parameters but A and B using the generic element
+            plan_other = cutlass.op.Conv2d(
+                kind=self.conv_kind,
+                element_C=self.element_C,
+                element_D=self.element_D, element_accumulator=self.element_accumulator,
+                element=self.element_A)
+            assert self._plans_equal(plan_other)
+
+            # Test without explicit accumulator. Only run if the type of C and the accumulator are equal
+            if self.element_C == self.element_accumulator:
+                plan_other = cutlass.op.Conv2d(
+                    kind=self.conv_kind,
+                    element_C=self.element_C,
+                    element_D=self.element_D,
+                    element=self.element_A)
+                assert self._plans_equal(plan_other)
+
+        # Test with only the generic types. Only run if the types of A, B, C, D, and the accumulator are the same
+        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
+            and self.element_A == self.element_accumulator):
+            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
+            assert self._plans_equal(plan_other)
+
+    def numpy_test(self):
+        """
+        Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
+
+        Returns without testing anything if numpy is not available.
+        """
+        if not datatypes.numpy_available:
+            return
+
+        import numpy as np
+        type_A = datatypes.numpy_type(self.element_A)
+        type_B = datatypes.numpy_type(self.element_B)
+        type_C = datatypes.numpy_type(self.element_C)
+        type_D = datatypes.numpy_type(self.element_D)
+        type_accum = datatypes.numpy_type(self.element_accumulator)
+
+        # Small placeholder tensors; they are only handed to the Conv2d constructor
+        size = (2, 2)
+        A = np.zeros(size, dtype=type_A)
+        B = np.zeros(size, dtype=type_B)
+        C = np.zeros(size, dtype=type_C)
+        D = np.zeros(size, dtype=type_D)
+
+        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
+
+    def torch_test(self):
+        """
+        Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
+
+        Returns without testing anything if torch is not available.
+        """
+        if not datatypes.torch_available:
+            return
+
+        import torch
+        type_A = datatypes.torch_type(self.element_A)
+        type_B = datatypes.torch_type(self.element_B)
+        type_C = datatypes.torch_type(self.element_C)
+        type_D = datatypes.torch_type(self.element_D)
+        type_accum = datatypes.torch_type(self.element_accumulator)
+
+        # Small placeholder tensors; they are only handed to the Conv2d constructor
+        size = (2, 2)
+
+        A = torch.empty(size, dtype=type_A)
+        B = torch.empty(size, dtype=type_B)
+        C = torch.empty(size, dtype=type_C)
+        D = torch.empty(size, dtype=type_D)
+
+        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
+
+    def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
+        """
+        Tests the equivalence of various constructions of the Conv2d interface when operands are
+        described by frontend tensors and frontend data types
+
+        :param type_A: frontend data type of operand A
+        :param type_B: frontend data type of operand B
+        :param type_C: frontend data type of operand C
+        :param type_D: frontend data type of operand D
+        :param type_accum: frontend data type used in accumulation
+        :param A: tensor for operand A
+        :param B: tensor for operand B
+        :param C: tensor for operand C
+        :param D: tensor for operand D
+        """
+        # Test when specifying all parameters via tensors
+        plan_other = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
+        assert self._plans_equal(plan_other)
+
+        # Test when specifying all parameters but A as tensors
+        plan_other = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
+        assert self._plans_equal(plan_other)
+
+        # Test when specifying all parameters but A and B as tensors and using generic element and output
+        if type_A == type_B:
+            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
+            assert self._plans_equal(plan_other)
+
+        # Test without explicit accumulator. Only run if the types of C and the accumulator are equal.
+        if type_C == type_accum:
+            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
+            assert self._plans_equal(plan_other)
+
+        # Test with only the generic types. Only run if the types of A, B, C, D, and the accumulator are the same.
+        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
+            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
+            assert self._plans_equal(plan_other)
+
+    def test_all(self):
+        """
+        Runs all tests on the Conv2d interface
+        """
+        self.generic_test()
+        self.numpy_test()
+        self.torch_test()
+
+
+# NOTE(review): `<= 80` also skips devices whose compute capability is exactly 80, although the
+# skip message refers to SM80 tests -- confirm whether `< 80` was intended.
+@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
+class ConvEquivalenceTest(unittest.TestCase):
+    """
+    Tests the equivalence of different constructions of the Conv2d interface
+
+    The class body is intentionally empty: its test methods are attached with `setattr`
+    by `add_test` when this module is imported.
+    """
+
+# Alignment used for an operand of each data type in the generated equivalence tests
+type2alignment = {cutlass.DataType.f16: 8, cutlass.DataType.f32: 4}
+
+def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
+    """
+    Create an equivalence test for the given specification and set it as a method of `ConvEquivalenceTest`.
+
+    :param conv_kind: kind of convolution forwarded to `Conv2dEquivalence`
+    :param element_A: data type of operand A
+    :param element_B: data type of operand B
+    :param element_C: data type of operand C
+    :param element_D: data type of operand D
+    :param element_accumulator: data type used in accumulation
+    """
+
+    def run(self):
+        # Alignments are looked up from the operand types when the test executes
+        alignments = {
+            "alignment_A": type2alignment[element_A],
+            "alignment_B": type2alignment[element_B],
+            "alignment_C": type2alignment[element_C],
+        }
+        equivalence = Conv2dEquivalence(
+            conv_kind=conv_kind,
+            element_A=element_A,
+            element_B=element_B,
+            element_C=element_C,
+            element_D=element_D,
+            element_accumulator=element_accumulator,
+            **alignments,
+        )
+        equivalence.test_all()
+
+    # The method name encodes the convolution kind and every data type so each combination is distinct
+    method_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"
+    setattr(ConvEquivalenceTest, method_name, run)
+
+# Register one test per convolution kind and data type combination.
+# Each row lists the data types of A, B, C, D, and the accumulator, in that order.
+for conv_kind in ["fprop", "wgrad", "dgrad"]:
+    for types in [
+        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16],
+        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32],
+        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f16],
+        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32],
+        [cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32],
+    ]:
+        add_test(conv_kind, *types)
+
+
+# NOTE(review): `<= 80` also skips devices whose compute capability is exactly 80, although the
+# skip message refers to SM80 tests -- confirm whether `< 80` was intended.
+@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
+class Conv2dErrorTests(unittest.TestCase):
+    """
+    Tests various error scenarios that arise with the high-level Conv2d interface
+    """
+
+    def test_alignment(self):
+        """
+        Tests case in which the alignment specified is unsupported
+        """
+        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
+
+        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
+            plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)
+
+    def test_invalid_tile_description(self):
+        """
+        Tests scenarios in which an invalid tile description is provided for a given CC
+        """
+        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
+
+        td = plan.tile_descriptions()[0]
+        td.threadblock_shape = [17, 32, 5]
+
+        plan.tile_description = td
+        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
+            plan.compile()
+
+        # Clean up the error message file (presumably written by the failed compilation). Only remove
+        # it if it exists so that a missing file does not turn into a spurious FileNotFoundError.
+        error_log = "./cutlass_python_compilation_device_error.txt"
+        if os.path.exists(error_log):
+            os.remove(error_log)
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/interface/gemm_interface.py
+++ b/test/python/interface/gemm_interface.py
@ -41,6 +41,7 @@ import cutlass
 import cutlass_bindings
 import cutlass.utils.datatypes as datatypes
 from cutlass.backend.utils.device import device_cc
+from utils import ExpectException


 class GemmEquivalence:
@ -220,38 +221,6 @@ class GemmEquivalenceTest(unittest.TestCase):
        gemm_eq.test_all()


-class ExpectException:
-    """
-    Utility class to assert that an exception was raised when expected
-
-    Example:
-
-    .. highlight:: python
-    .. code-block:: python
-
-        with ExceptionExpected(True, 'Division by zero'):
-            x = 1.0 / 0.0
-
-    :param exception_expected: whether an exception is expected to be raised
-    :type exception_expected: bool
-    :param message: message to print if an exception is raised when not expected or vice versa
-    :type message: str
-    """
-    def __init__(self, exception_expected: bool, message: str = ''):
-        self.exception_expected = exception_expected
-        self.message = message
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, traceback):
-        exception_raised = exc_type is not None
-        assert self.exception_expected == exception_raised, self.message
-
-        # Suppress the exception
-        return True
-
-
 class GemmErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Gemm interface
@ -316,9 +285,22 @@ class GemmErrorTests(unittest.TestCase):
            td.stages = 0
            plan.construct(td)

-        with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
-            td.stages = 3
-            plan.construct(td)
+        if cc < 90:
+            with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
+                td.stages = 3
+                plan.construct(td)
+        else:
+            original_kschedule = td.kernel_schedule
+            original_eschedule = td.epilogue_schedule
+            with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
+                td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
+                td.epilogue_schedule = cutlass.EpilogueScheduleType.NoSmemWarpSpecialized
+                td.stages = 3
+                plan.construct(td)
+
+            # Reset schedules
+            td.kernel_schedule = original_kschedule
+            td.epilogue_schedule = original_eschedule

        with ExpectException(True, f'Requested too many stages'):
            td.stages = 100
@ -335,9 +317,25 @@ class GemmErrorTests(unittest.TestCase):
        # Reset cluster shape
        td.cluster_shape = cluster_shape

-        kernel_schedule = td.kernel_schedule
-        with ExpectException(cc < 90, f'Requested a persistent kernel on SM{cc}'):
+        with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
+            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
+            plan.construct(td)
+
+        with ExpectException(True, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
+            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
+            td.epilogue_schedule = cutlass.EpilogueScheduleType.ScheduleAuto
+            plan.construct(td)
+
+        with ExpectException(True, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
+            td.kernel_schedule = cutlass.KernelScheduleType.ScheduleAuto
+            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
+            plan.construct(td)
+
+        with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
+            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative
+            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
+            td.tile_scheduler = cutlass.TileSchedulerType.StreamK
            plan.construct(td)

        # Ensure that all returned tile descriptions are unique
--- a/test/python/interface/utils.py
+++ b/test/python/interface/utils.py
@ -0,0 +1,65 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Helper functions & classes for interface test
+"""
+class ExpectException:
+    """
+    Utility class to assert that an exception was raised when expected
+
+    Example:
+
+    .. highlight:: python
+    .. code-block:: python
+
+        with ExpectException(True, 'Division by zero'):
+            x = 1.0 / 0.0
+
+    :param exception_expected: whether an exception is expected to be raised
+    :type exception_expected: bool
+    :param message: message to print if an exception is raised when not expected or vice versa
+    :type message: str
+    """
+    def __init__(self, exception_expected: bool, message: str = ''):
+        self.exception_expected = exception_expected
+        self.message = message
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, traceback):
+        """
+        Checks whether an exception was raised in the ``with`` body and compares it with the expectation
+
+        :raises AssertionError: if an exception was raised when not expected or vice versa
+
+        :return: True to suppress an exception derived from ``Exception``; False for any other
+                 ``BaseException`` (e.g., ``KeyboardInterrupt``) so that it keeps propagating
+        :rtype: bool
+        """
+        # Interpreter-level exits such as KeyboardInterrupt or SystemExit are not test outcomes:
+        # let them propagate instead of counting them as the "expected" exception.
+        if exc_type is not None and not issubclass(exc_type, Exception):
+            return False
+
+        exception_raised = exc_type is not None
+        # Raise explicitly rather than via `assert` so the check is not stripped under `python -O`
+        if self.exception_expected != exception_raised:
+            raise AssertionError(self.message)
+
+        # Suppress the exception
+        return True