CUTLASS 3.2.1 (#1113)

* Updates for 3.2.1 release.
* Minor fix in gemm op profiler for raster order.
* Add scheduler mapping for raster order in the kernels.
2023-09-26 14:24:26 -07:00
parent e0aaa3c3b3
commit 90d3b0fb18
428 changed files with 22253 additions and 21762 deletions
--- a/test/python/backend/conv/__init__.py
+++ b/test/python/backend/conv/__init__.py
--- a/test/python/backend/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
+++ b/test/python/backend/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
@@ -1,233 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
-from cutlass.backend.conv2d_operation import *
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@@ -1,209 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=4, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=4, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
@@ -1,130 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend.conv2d_operation import *
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32, 
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 8], stages=4, 
-            warp_count=[4, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 8], stages=4, 
-            warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
@@ -1,127 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Unity,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
@ -1,196 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-def conv2d_few_channel_problemsizes(channels):
-    problem_sizes = [
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 8, 8, channels),
-            cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 16, 16, channels),
-            cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 16, 16, channels),
-            cutlass_bindings.Tensor4DCoord(16, 7, 7, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(32, 7, 7, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(64, 7, 7, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-    ]
-
-    return problem_sizes
-
-class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=2)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=2)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.few_channels,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))
-    
-    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=1)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=2, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.few_channels,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py
@ -1,220 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-def conv2d_fixed_channel_problemsizes(channels):
-    problem_sizes = [
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 8, 8, channels),
-            cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(32, 7, 7, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(64, 7, 7, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
-            cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(2, 2),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        ),
-    ]
-
-    return problem_sizes
-
-class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.fixed_channels,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(8)))
-    
-    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.fixed_channels,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(4)))
-    
-    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=2)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=2)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.fixed_channels,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(2)))
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
+++ b/test/python/backend/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
@ -1,341 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=2)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=2)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 14),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 14),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 23, 56, 98),
-                cutlass_bindings.Tensor4DCoord(128, 3, 3, 98),
-                cutlass_bindings.Tensor4DCoord(4, 0, 5, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=2)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=2)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 14),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 14),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 23, 56, 98),
-                cutlass_bindings.Tensor4DCoord(128, 3, 3, 98),
-                cutlass_bindings.Tensor4DCoord(4, 0, 5, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float16)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 28),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 28),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 23, 56, 100),
-                cutlass_bindings.Tensor4DCoord(128, 3, 3, 100),
-                cutlass_bindings.Tensor4DCoord(4, 0, 5, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -1,86 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
@ -1,128 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend.conv2d_operation import *
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 8], stages=4, 
-            warp_count=[4, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle2
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 8], stages=4, 
-            warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -1,139 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=2)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=2)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.fprop, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            )
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -1,285 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 256, 64], stages=3, 
-            warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 56, 56, 12),
-                cutlass_bindings.Tensor4DCoord(8, 1, 1, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(2, 2),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 55, 55, 12),
-                cutlass_bindings.Tensor4DCoord(8, 1, 1, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(2, 2),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
+++ b/test/python/backend/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py
@ -1,129 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
-    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, math_inst.element_accumulator, 
-            cutlass_bindings.float16
-        )
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float16,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, math_inst.element_accumulator, 
-            cutlass_bindings.float16
-        )
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -1,274 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
-    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=8)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 256, 32], stages=3, 
-            warp_count=[1, 4, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
-                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
-                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-                cutlass_bindings.MatrixCoord(3, 3),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py
@ -1,128 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend.conv2d_operation import *
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 8], stages=4, 
-            warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 8], stages=4, 
-            warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
+++ b/test/python/backend/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py
@ -1,139 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-from cutlass.backend.utils.device import device_cc
-import unittest
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
-    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=4)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=8)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 16], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        self.assertTrue(test_all_conv2d(operation))
-    
-    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        A = TensorDescription(
-            element=math_inst.element_a, 
-            layout=cutlass_bindings.TensorNHWC,
-            alignment=1)
-        B = TensorDescription(
-            element=math_inst.element_b, 
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=1)
-        C = TensorDescription(
-            element=cutlass_bindings.float32,
-            layout=cutlass_bindings.TensorNHWC, 
-            alignment=4)
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32], stages=3, 
-            warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-
-        operation = Conv2dOperation(
-            conv_kind=cutlass_bindings.conv.Operator.wgrad, iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
-            arch=80, tile_description=tile_description, A=A, B=B, C=C, 
-            stride_support=StrideSupport.Strided,
-            epilogue_functor=epilogue_functor,
-            swizzling_functor=cutlass_bindings.IdentitySwizzle1
-        )
-        
-        problem_sizes = [
-            cutlass_bindings.conv.Conv2dProblemSize(
-                cutlass_bindings.Tensor4DCoord(1, 8, 8, 1),
-                cutlass_bindings.Tensor4DCoord(1, 3, 3, 1),
-                cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.MatrixCoord(1, 1),
-                cutlass_bindings.conv.Mode.cross_correlation,
-                1, 1
-            ),
-        ]
-        
-        self.assertTrue(test_all_conv2d(operation, problem_sizes))
-    
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**26, 2**26)
-    unittest.main()
--- a/test/python/backend/gemm/init.py
+++ b/test/python/backend/gemm/init.py
--- a/test/python/backend/gemm/gemm_bf16_sm80.py
+++ b/test/python/backend/gemm/gemm_bf16_sm80.py
@ -1,128 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class GemmBF16TensorOpSm80(unittest.TestCase):
-    def SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32_64x128x64_32x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.bfloat16, element_b=cutlass_bindings.bfloat16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 128, 64],
-            stages=4, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.bfloat16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.bfloat16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32_128x256x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.bfloat16, element_b=cutlass_bindings.bfloat16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 128, 32],
-            stages=6, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.bfloat16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.bfloat16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.bfloat16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, cutlass_bindings.float32)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "multistage"))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_bf16_sm90.py
+++ b/test/python/backend/gemm/gemm_bf16_sm90.py
@ -1,138 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-from functools import partial
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend import library
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.utils import LayoutCombination, get_name
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-name_fn = partial(get_name, element_a=cutlass_bindings.bfloat16, element_b=cutlass_bindings.bfloat16, arch=90)
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
-             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: indexable container of layouts of A, B, and C operands
-    :param alignments: indexable container of alignments of A, B, and C operands
-    :param element_output: data type of the output element
-    :param element_accumulator: data type used in accumulation
-    :param element_epilogue: data type used in computing the epilogue
-    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
-    :param threadblock_shape: indexable container of dimensions of threadblock tiles
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass_bindings.OpClass
-    :param persistent: whether this is a persistent warp-specialized kernel
-    :type persistent: bool
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass_bindings.bfloat16
-        element_B = cutlass_bindings.bfloat16
-        inst_shape = [1, 1, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        warp_count = [2, 2, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        math_inst = MathInstruction(
-            instruction_shape=inst_shape,
-            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
-            opcode_class=opclass, math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=threadblock_shape,
-            cluster_shape=cluster_shape,
-            stages=stages, warp_count=warp_count,
-            math_instruction=math_inst,
-            persistent=persistent
-        )
-
-        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
-        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
-        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])
-
-        epilogue_functor = LinearCombination(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)
-
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=90, tile_description=tile_description, A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor)
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-    if persistent:
-        suffix = "_persistent"
-    else:
-        suffix = ""
-
-    name = name_fn(layouts, alignments, element_output, element_accumulator,
-                  element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass, suffix=suffix)
-    setattr(cls, name, run)
-
-    return run
-
-
-@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
-class GemmBF16Sm90(unittest.TestCase):
-    """
-    Wrapper class to which tests will be added dynamically in __main__
-    """
-    pass
-
-
-add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
-add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
-
-add_test_tensorop(GemmBF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 3)
-add_test_tensorop(GemmBF16Sm90, LayoutCombination.NNN, [4, 4, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 5)
-add_test_tensorop(GemmBF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmBF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 32], None, persistent=True)
-add_test_simt(GemmBF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 8], 2)
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_f16_sm80.py
+++ b/test/python/backend/gemm/gemm_f16_sm80.py
@ -1,479 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class GemmF16Sm80(unittest.TestCase):
-    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,
-            direct_store=True
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-    def test_SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32_128x128x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 64],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 256, 64],
-            stages=3, warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32_256x128x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[256, 128, 64],
-            stages=3, warp_count=[4, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 64, 64],
-            stages=3, warp_count=[2, 1, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float16
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32_64x64x32_32x32x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float16, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 64, 32],
-            stages=10, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float16
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32_256x128x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[256, 128, 64],
-            stages=3, warp_count=[4, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_test_SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 64, 64],
-            stages=3, warp_count=[2, 1, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_128x256x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 256, 64],
-            stages=3, warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.RowMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32_128x256x64_64x64x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16],
-            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 256, 64],
-            stages=3, warp_count=[2, 4, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_f16_sm90.py
+++ b/test/python/backend/gemm/gemm_f16_sm90.py
@ -1,182 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-from functools import partial
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend import library
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.utils import LayoutCombination, get_name
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-# Partial specialziation for naming tests
-name_fn = partial(get_name, element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16, arch=90)
-
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
-             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: indexable container of layouts of A, B, and C operands
-    :param alignments: indexable container of alignments of A, B, and C operands
-    :param element_output: data type of the output element
-    :param element_accumulator: data type used in accumulation
-    :param element_epilogue: data type used in computing the epilogue
-    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
-    :param threadblock_shape: indexable container of dimensions of threadblock tiles
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass_bindings.OpClass
-    :param persistent: whether this is a persistent warp-specialized kernel
-    :type persistent: bool
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-
-        element_A = cutlass_bindings.float16
-        element_B = cutlass_bindings.float16
-        inst_shape = [1, 1, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        warp_count = [2, 2, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        math_inst = MathInstruction(
-            instruction_shape=inst_shape,
-            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
-            opcode_class=opclass, math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=threadblock_shape,
-            cluster_shape=cluster_shape,
-            stages=stages, warp_count=warp_count,
-            math_instruction=math_inst,
-            persistent=persistent
-        )
-
-        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
-        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
-        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])
-
-        epilogue_functor = LinearCombination(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)
-
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=90, tile_description=tile_description, A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor)
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-    if persistent:
-        suffix = "_persistent"
-    else:
-        suffix = ""
-
-    name = name_fn(layouts, alignments, element_output, element_accumulator,
-                  element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass, suffix=suffix)
-    setattr(cls, name, run)
-
-    return run
-
-
-@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
-class GemmF16Sm90(unittest.TestCase):
-    """
-    Wrapper class to which tests will be added dynamically in __main__
-    """
-    pass
-
-
-add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
-add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
-
-# Tests with 1x1x1 clusters
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 3)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NTT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 64, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 64], 5)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [2, 2, 2], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
-
-# Tests with different cluster shapes
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 4, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 4, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 1, 1], [64, 128, 64], None)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 2, 1], [64, 128, 64], None)
-
-# Tests for persistent warp-specialized threadblocks
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 2, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 4, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 4, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 1, 1], [64, 128, 64], None, persistent=True)
-add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 4, 1], [64, 128, 64], None, persistent=True)
-
-# Tests using SIMT
-add_test_simt(GemmF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.NTN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 64, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.TTN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 8], 2)
-add_test_simt(GemmF16Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 8], 2)
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_f32_sm80.py
+++ b/test/python/backend/gemm/gemm_f32_sm80.py
@ -1,178 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.memory_manager import get_allocated_size
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class GemmF32nF32nF32nTensorOpF32Sm80(unittest.TestCase):
-    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add_fast_bf16
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-
-    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32_128x128x32_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_fast_accurate_f32_64x64x32_32x32x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 8],
-            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
-            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add_fast_f32
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 64, 32],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**24, 2**24)
-    cutlass.backend.compiler.load_from_cache()
-    unittest.main()
--- a/test/python/backend/gemm/gemm_f64_sm80.py
+++ b/test/python/backend/gemm/gemm_f64_sm80.py
@ -1,134 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class GemmF64TensorOpSm80(unittest.TestCase):
-    def test_SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64_32x32x16_16x16x16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[8, 8, 4],
-            element_a=cutlass_bindings.float64, element_b=cutlass_bindings.float64,
-            element_accumulator=cutlass_bindings.float64, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[32, 32, 16],
-            stages=4, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        # alignment 1 restricted for double
-        A = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.ColumnMajor,
-            alignment=1
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        element_epilogue = cutlass_bindings.float64
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-    
-    def test_SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64_64x64x16_32x32x16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[8, 8, 4],
-            element_a=cutlass_bindings.float64, element_b=cutlass_bindings.float64,
-            element_accumulator=cutlass_bindings.float64, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 64, 16],
-            stages=4, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        # alignment 1 restricted for double
-        A = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.ColumnMajor,
-            alignment=1
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        element_epilogue = cutlass_bindings.float64
-
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_f64_sm90.py
+++ b/test/python/backend/gemm/gemm_f64_sm90.py
@ -1,124 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-from functools import partial
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend import library
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.utils import LayoutCombination, get_name
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-name_fn = partial(get_name, element_a=cutlass_bindings.float64, element_b=cutlass_bindings.float64, arch=90)
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
-             cluster_shape, threadblock_shape, stages, opclass):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: indexable container of layouts of A, B, and C operands
-    :param alignments: indexable container of alignments of A, B, and C operands
-    :param element_output: data type of the output element
-    :param element_accumulator: data type used in accumulation
-    :param element_epilogue: data type used in computing the epilogue
-    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
-    :param threadblock_shape: indexable container of dimensions of threadblock tiles
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass_bindings.OpClass
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass_bindings.float64
-        element_B = cutlass_bindings.float64
-        inst_shape = [1, 1, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        warp_count = [2, 2, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        math_inst = MathInstruction(
-            instruction_shape=inst_shape,
-            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
-            opcode_class=opclass, math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=threadblock_shape,
-            cluster_shape=cluster_shape,
-            stages=stages, warp_count=warp_count,
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
-        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
-        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])
-
-        epilogue_functor = LinearCombination(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)
-
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=90, tile_description=tile_description, A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor)
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-    name = name_fn(layouts, alignments, element_output, element_accumulator,
-                  element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass)
-    setattr(cls, name, run)
-
-    return run
-
-
-@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
-class GemmF64Sm90(unittest.TestCase):
-    """
-    Wrapper class to which tests will be added dynamically in __main__
-    """
-    pass
-
-
-add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
-add_test_simt(GemmF64Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.float64, cutlass_bindings.float64, cutlass_bindings.float64, [1, 1, 1], [64, 64, 32], 2)
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_grouped_sm80.py
+++ b/test/python/backend/gemm/gemm_grouped_sm80.py
@ -1,235 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.gemm_grouped_testbed import TestbedGrouped
-from cutlass.backend.utils.device import device_cc
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class GemmGroupedSm80(unittest.TestCase):
-    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16], element_a=cutlass_bindings.float16,
-            element_b=cutlass_bindings.float16, element_accumulator=cutlass_bindings.float32,
-            opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
-
-        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
-            operation = GemmOperationGrouped(
-                80,
-                tile_description, A, B, C,
-                epilogue_functor, swizzling_functor,
-                precompute_mode=precompute_mode
-            )
-
-            testbed = TestbedGrouped(operation=operation)
-
-            self.assertTrue(testbed.run(24))
-    
-    def test_SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64_64x64x16_32x32x16(self):
-        math_inst = MathInstruction(
-            instruction_shape=[8, 8, 4], element_a=cutlass_bindings.float64,
-            element_b=cutlass_bindings.float64, element_accumulator=cutlass_bindings.float64,
-            opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 64, 16],
-            stages=4, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        B = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        C = TensorDescription(
-            element=cutlass_bindings.float64, layout=cutlass_bindings.ColumnMajor,
-            alignment=1
-        )
-
-        element_epilogue = cutlass_bindings.float64
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
-
-        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
-            operation = GemmOperationGrouped(
-                80,
-                tile_description, A, B, C,
-                epilogue_functor, swizzling_functor,
-                precompute_mode=precompute_mode
-            )
-
-            testbed = TestbedGrouped(operation=operation)
-
-            self.assertTrue(testbed.run(24))
-    
-    def test_SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32_128x64x8_64x32x1(self):
-        math_inst = MathInstruction(
-            instruction_shape=[1, 1, 1], element_a=cutlass_bindings.float32,
-            element_b=cutlass_bindings.float32, element_accumulator=cutlass_bindings.float32,
-            opcode_class=cutlass_bindings.OpClass.Simt,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 64, 8],
-            stages=4, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        B = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
-            alignment=1
-        )
-
-        element_epilogue = cutlass_bindings.float32
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
-
-        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
-            operation = GemmOperationGrouped(
-                80,
-                tile_description, A, B, C,
-                epilogue_functor, swizzling_functor,
-                precompute_mode=precompute_mode
-            )
-
-            testbed = TestbedGrouped(operation=operation)
-
-            self.assertTrue(testbed.run(27))
-    
-    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32_cache(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 16], element_a=cutlass_bindings.float16,
-            element_b=cutlass_bindings.float16, element_accumulator=cutlass_bindings.float32,
-            opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 32],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-
-        B = TensorDescription(
-            element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
-            alignment=8
-        )
-
-        C = TensorDescription(
-            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.float32
-        epilogue_functor = LinearCombination(
-            C.element, C.alignment, 
-            math_inst.element_accumulator, element_epilogue)
-        swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
-
-        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
-            operation = GemmOperationGrouped(
-                80,
-                tile_description, A, B, C,
-                epilogue_functor, swizzling_functor,
-                precompute_mode=precompute_mode
-            )
-
-            testbed = TestbedGrouped(operation=operation)
-
-            self.assertTrue(testbed.run(5))
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_s8_sm80.py
+++ b/test/python/backend/gemm/gemm_s8_sm80.py
@ -1,261 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend.epilogue import LinearCombinationClamp
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
-class GemmS8TensorOpF32Sm80(unittest.TestCase):
-    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_64x64x64_32x32x64(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 32],
-            element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8,
-            element_accumulator=cutlass_bindings.int32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add_saturate
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[64, 64, 64],
-            stages=6, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajorInterleaved32,
-            alignment=16
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.RowMajorInterleaved32,
-            alignment=16
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajorInterleaved32,
-            alignment=8
-        )
-
-        epilogue_functor = FastLinearCombinationClamp(
-            C.element, C.alignment
-        )
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "interleaved"))
-    
-    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_256x128x128_64x64x128(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 32],
-            element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8,
-            element_accumulator=cutlass_bindings.int32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 128],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.RowMajor,
-            alignment=16
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajor,
-            alignment=16
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.RowMajor,
-            alignment=16
-        )
-
-        epilogue_functor = FastLinearCombinationClamp(
-            C.element, C.alignment
-        )
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "multistage"))
-    
-    def test_SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32_128x128x128_64x64x128(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 32],
-            element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8,
-            element_accumulator=cutlass_bindings.int32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 128],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.RowMajor,
-            alignment=16
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajor,
-            alignment=16
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajor,
-            alignment=16
-        )
-
-        epilogue_functor = FastLinearCombinationClamp(
-            C.element, C.alignment
-        )
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "multistage"))
-    
-    def test_SM80_Device_Gemm_s8t_s8n_s32n_tensor_op_s32_128x128x128_64x64x128(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 32],
-            element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8,
-            element_accumulator=cutlass_bindings.int32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 128],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.RowMajor,
-            alignment=16
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajor,
-            alignment=16
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.int32, layout=cutlass_bindings.ColumnMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.int32
-
-        epilogue_functor = LinearCombinationClamp(
-            C.element, C.alignment, math_inst.element_accumulator, 
-            element_epilogue
-        )
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C, 
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "multistage"))
-    
-    def test_SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32_128x128x128_64x64x128(self):
-        math_inst = MathInstruction(
-            instruction_shape=[16, 8, 32],
-            element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8,
-            element_accumulator=cutlass_bindings.int32, opcode_class=cutlass_bindings.OpClass.TensorOp,
-            math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=[128, 128, 128],
-            stages=3, warp_count=[2, 2, 1],
-            math_instruction=math_inst
-        )
-
-        A = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.RowMajor,
-            alignment=16
-        )
-        B = TensorDescription(
-            element=cutlass_bindings.int8, layout=cutlass_bindings.ColumnMajor,
-            alignment=16
-        )
-        C = TensorDescription(
-            element=cutlass_bindings.int32, layout=cutlass_bindings.RowMajor,
-            alignment=4
-        )
-
-        element_epilogue = cutlass_bindings.int32
-
-        epilogue_functor = LinearCombinationClamp(
-            C.element, C.alignment, math_inst.element_accumulator, 
-            element_epilogue
-        )
-        
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=80, tile_description=tile_description,
-            A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
-        )
-
-        self.assertTrue(test_all_gemm(operation, "multistage"))
-    
-
-
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/backend/gemm/gemm_s8_sm90.py
+++ b/test/python/backend/gemm/gemm_s8_sm90.py
@ -1,154 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-from functools import partial
-import cutlass.backend
-from cutlass.backend import *
-from cutlass.backend import library
-from cutlass.backend.test import *
-import unittest
-
-from cutlass.backend.test.utils import LayoutCombination, get_name
-from cutlass.backend.test.gemm_testbed import test_all_gemm
-from cutlass.backend.utils.device import device_cc
-
-
-name_fn = partial(get_name, element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16, arch=90)
-
-def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
-             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
-    """
-    Create a test-running function with the given specification and set it as a method of `cls`.
-
-    :param cls: class to which the generated method will be added
-    :type cls: type
-    :param layouts: indexable container of layouts of A, B, and C operands
-    :param alignments: indexable container of alignments of A, B, and C operands
-    :param element_output: data type of the output element
-    :param element_accumulator: data type used in accumulation
-    :param element_epilogue: data type used in computing the epilogue
-    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
-    :param threadblock_shape: indexable container of dimensions of threadblock tiles
-    :param stages: number of pipeline stages to use in the kernel
-    :type stages: int
-    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
-    :type opclass: cutlass_bindings.OpClass
-    :param persistent: whether this is a persistent warp-specialized kernel
-    :type persistent: bool
-    """
-
-    def run(self):
-        """
-        Dynamically-generated function that constructs a GEMM operation and verifies it against
-        multiple test cases.
-        """
-        element_A = cutlass_bindings.int8
-        element_B = cutlass_bindings.int8
-        inst_shape = [1, 1, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        warp_count = [2, 2, 1] if opclass == cutlass_bindings.OpClass.Simt else None
-        math_inst = MathInstruction(
-            instruction_shape=inst_shape,
-            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
-            opcode_class=opclass, math_operation=MathOperation.multiply_add
-        )
-
-        tile_description = TileDescription(
-            threadblock_shape=threadblock_shape,
-            cluster_shape=cluster_shape,
-            stages=stages, warp_count=warp_count,
-            math_instruction=math_inst,
-            persistent=persistent
-        )
-
-        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
-        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
-        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])
-
-        if opclass == cutlass_bindings.OpClass.Simt:
-            epilogue_functor_cls = LinearCombinationClamp
-        else:
-            epilogue_functor_cls = LinearCombination
-        epilogue_functor = epilogue_functor_cls(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)
-
-        swizzling_functor = cutlass_bindings.IdentitySwizzle1
-
-        operation = GemmOperationUniversal(
-            arch=90, tile_description=tile_description, A=A, B=B, C=C,
-            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor)
-
-        self.assertTrue(test_all_gemm(operation, "universal"))
-
-    if persistent:
-        suffix = "_persistent"
-    else:
-        suffix = ""
-
-    name = name_fn(layouts, alignments, element_output, element_accumulator,
-                  element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass, suffix=suffix)
-    setattr(cls, name, run)
-
-    return run
-
-
-@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
-class GemmS8Sm90(unittest.TestCase):
-    """
-    Wrapper class to which tests will be added dynamically in __main__
-    """
-    pass
-
-
-add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
-add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
-
-# Tests with 1x1x1 clusters
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNN, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], 3)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 8],  cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [64, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 64, 32], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [4, 4, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], None)
-
-# Tests with different cluster shapes
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [2, 2, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 4, 1], [128, 128, 128], None)
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [4, 4, 1], [128, 128, 128], None)
-
-# Tests with persistent warp-specialized threadblocks
-add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [2, 1, 1], [128, 128, 128], None, persistent=True)
-
-# Tests for SIMT
-add_test_simt(GemmS8Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [64, 32, 8], 2)
-
-if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    unittest.main()
--- a/test/python/conv2d/conv2d_test_utils.py
+++ b/test/python/conv2d/conv2d_test_utils.py
@ -1,508 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-
-"""
-Util Functions for Conv2d Test
-"""
-import torch
-import cutlass
-import unittest
-import cutlass_bindings
-from cutlass.utils.datatypes import binding_type, binding_opclass
-from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
-from cutlass.backend.utils.device import device_cc
-from cutlass.backend.test.utils import get_name_conv2d
-import numpy as np
-
-def conv2d_few_channel_problemsizes(channels):
-    """
-    Build the list of Conv2d problem sizes used by the few-channel tests.
-
-    Every problem uses a batch of 1, a square input and a square filter, the
-    coordinate (1, 1, 1, 1) as third argument, (1, 1) as fifth argument,
-    cross-correlation mode and two trailing positional arguments equal to 1.
-    Only the input extent, the number of filters, the filter extent and the
-    (square) fourth argument change from one problem to the next.
-
-    :param channels: channel count shared by the input and the filter tensor
-
-    :return: list of ``cutlass_bindings.conv.Conv2dProblemSize`` objects
-    """
-    # (input extent, filter count, filter extent, value of the 4th MatrixCoord)
-    # NOTE(review): the 4th argument is presumably the stride -- confirm
-    # against the Conv2dProblemSize binding.
-    variants = [
-        (8, 16, 3, 2),
-        (16, 16, 3, 2),
-        (16, 16, 7, 1),
-        (224, 32, 7, 1),
-        (224, 64, 7, 2),
-        (224, 64, 5, 1),
-        (224, 64, 5, 2),
-    ]
-
-    return [
-        cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, extent, extent, channels),
-            cutlass_bindings.Tensor4DCoord(filters, kernel, kernel, channels),
-            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
-            cutlass_bindings.MatrixCoord(step, step),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        )
-        for extent, filters, kernel, step in variants
-    ]
-
-# Lookup from a cutlass.DataType to the torch dtype used when tensors are
-# created through torch (see Conv2dLauncherFrontend.uniform_init). Only the
-# three floating-point types listed here are available.
-torch_dtype = {
-    cutlass.DataType.f16: torch.float16,
-    cutlass.DataType.f32: torch.float32,
-    cutlass.DataType.f64: torch.float64
-}
-
-# Lookup from a cutlass.DataType to the numpy dtype used when tensors are
-# created through numpy (see Conv2dLauncherFrontend.uniform_init).
-numpy_dtype = {
-    cutlass.DataType.f16: np.float16,
-    cutlass.DataType.f32: np.float32,
-    cutlass.DataType.f64: np.float64
-}
-
-
-def validate_problem_size(ps, conv_kind, split_k_slices):
-    """
-    Decide whether a problem size can be exercised by the Conv2d tests.
-
-    A problem is rejected when the output extents stored in ``ps`` (``ps.P``,
-    ``ps.Q``) differ from the extents recomputed from its input size, padding,
-    dilation, filter size and stride, or when it asks for split-K on a strided
-    dgrad.
-
-    :param ps: problem size exposing H, W, R, S, P, Q, pad_*, stride_* and dilation_*
-    :param conv_kind: convolution kind as a string (only ``"dgrad"`` is special-cased)
-    :param split_k_slices: number of split-K slices requested
-
-    :return: True if the problem should be run, False otherwise
-    :rtype: bool
-    """
-    out_h = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
-    out_w = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
-    if not (out_h == ps.P and out_w == ps.Q):
-        return False
-
-    # Split-K (serial or parallel) is not supported for strided dgrad
-    return not (
-        conv_kind == "dgrad"
-        and split_k_slices > 1
-        and (ps.stride_h > 1 or ps.stride_w > 1)
-    )
-
-
-# Override the backend launcher
-class Conv2dLauncherFrontend(Conv2dLauncher):
-    """
-    Test launcher that runs a ``cutlass.Conv2d`` plan and verifies its output.
-
-    With the ``"numpy"`` backend the tensors are numpy arrays and the expected
-    result comes from the host reference (optionally through the cached-result
-    file). With any other backend value the tensors are torch CUDA tensors and
-    the expected result is computed with torch.
-    """
-
-    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
-        """
-        :param plan: Conv2d plan that :meth:`run` executes
-        :param seed: seed applied to the backend's random generator before each run
-        :param backend: ``"numpy"`` or ``"torch"``
-        """
-        # NOTE: the base-class initializer is not invoked; every attribute read
-        # by the methods of this class is assigned directly below.
-        self.operation = plan
-        self.conv_kind = plan.conv_kind
-        self.seed = seed
-        self.backend = backend
-
-        self.dtype_A = plan._element_a
-        self.dtype_B = plan._element_b
-        self.dtype_C = plan._element_c
-        self.dtype_acc = plan._element_accumulator
-
-        # All four tensors are handed to the bindings with the NHWC layout.
-        self.layout_A = cutlass_bindings.TensorNHWC
-        self.layout_B = cutlass_bindings.TensorNHWC
-        self.layout_C = cutlass_bindings.TensorNHWC
-        self.layout_D = cutlass_bindings.TensorNHWC
-
-        self.element_compute = cutlass_bindings.float32
-        self.enable_cached_results = True
-
-        # Bound of the random input values: kept smaller for 16-bit inputs, and
-        # smaller still when the accumulator is 16-bit as well.
-        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
-            if self.dtype_acc in [cutlass.DataType.f16, cutlass.DataType.bf16]:
-                self.randomization_max = 2
-            else:
-                self.randomization_max = 3
-        else:
-            self.randomization_max = 7
-
-        self.activation = plan.activation
-
-        self.host_conv2d = cutlass_bindings.test.conv.host.conv2d
-
-    def set_seed(self):
-        """Seed the random number generator of the selected backend with ``self.seed``."""
-        if self.backend == "numpy":
-            np.random.seed(self.seed)
-        else:
-            torch.manual_seed(self.seed)
-
-    def uniform_init(self, size, dtype):
-        """
-        Create a random tensor of integer-valued elements.
-
-        :param size: shape of the tensor to create
-        :param dtype: ``cutlass.DataType`` of the elements (must be a key of
-            ``numpy_dtype`` / ``torch_dtype``)
-
-        :return: numpy array (numpy backend, delegated to the base class) or a
-            channels-last torch tensor on the CUDA device
-        """
-        if self.backend == "numpy":
-            return super().uniform_init(size, numpy_dtype[dtype])
-        else:
-            # Draw uniformly, then round up so that every element is an integer.
-            tensor = torch.ceil(
-                torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda").uniform_(-self.randomization_max - 0.5, self.randomization_max - 0.5)
-            ).to(memory_format=torch.channels_last)
-            return tensor
-
-    def zeros_like(self, tensor):
-        """Return a zero-filled tensor shaped like ``tensor`` (channels-last for torch)."""
-        if self.backend == "numpy":
-            return np.zeros_like(tensor)
-        else:
-            return torch.zeros_like(tensor).to(memory_format=torch.channels_last)
-
-    def reference(self, ps, A, B, C, alpha, beta, activation):
-        """
-        Compute the expected output for the problem ``ps``.
-
-        The numpy backend delegates to :meth:`host_reference`. Otherwise the
-        result is ``alpha * conv(A, B) + beta * C`` computed with torch, followed
-        by ReLU or leaky ReLU (slope 0.5) when ``activation`` is one of those two;
-        any other activation leaves the torch result unchanged.
-
-        :param ps: problem size (stride, padding, dilation and tensor extents)
-        :param A: operand A
-        :param B: operand B
-        :param C: source tensor scaled by ``beta``
-        :param alpha: scalar applied to the convolution result
-        :param beta: scalar applied to ``C``
-        :param activation: activation functor of the plan
-
-        :raises Exception: if the convolution kind is not fprop, dgrad or wgrad
-        """
-        if self.backend == "numpy":
-            numpy_result = self.host_reference(ps, A, B, C, alpha, beta, activation)
-            return numpy_result
-        else:
-            stride = (ps.stride_h, ps.stride_w)
-            padding = (ps.pad_h, ps.pad_w)
-            # The device operation is launched with this dilation, so every
-            # reference below has to receive it too (dgrad and wgrad would
-            # otherwise silently fall back to a dilation of 1).
-            dilation = (ps.dilation_h, ps.dilation_w)
-
-            if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
-                torch_result = alpha * torch.ops.aten.conv2d(
-                    A,
-                    B,
-                    stride=stride,
-                    padding=padding,
-                    dilation=dilation
-                ) + beta * C
-            elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
-                torch_result = alpha * torch.nn.grad.conv2d_input(
-                    (ps.N, ps.C, ps.H, ps.W),
-                    B,
-                    A,
-                    padding=padding,
-                    stride=stride,
-                    dilation=dilation
-                ) + beta * C
-            elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
-                torch_result = alpha * torch.nn.grad.conv2d_weight(
-                    B,
-                    (ps.K, ps.C, ps.R, ps.S),
-                    A,
-                    padding=padding,
-                    stride=stride,
-                    dilation=dilation
-                ) + beta * C
-            else:
-                raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
-
-            if activation == cutlass.backend.epilogue.relu:
-                torch_result = torch.nn.functional.relu(torch_result)
-            elif activation == cutlass.backend.epilogue.leaky_relu:
-                torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
-
-            return torch_result
-
-    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
-        """
-        Compute (or look up) the host reference for the numpy backend.
-
-        When result caching is enabled, the cache file named after the plan's
-        architecture and the seed is searched first; on a miss the convolution
-        is computed on the host, hashed and appended to that file.
-
-        :return: the ``D`` entry of the cached test result (a ``TensorHash`` of
-            the reference output) when caching is enabled, otherwise the
-            reference output tensor itself
-        """
-        # Convert the scalars to the compute type; other compute types use them as given.
-        if self.element_compute == cutlass_bindings.float16:
-            alpha = cutlass_bindings.float16(alpha)
-            beta = cutlass_bindings.float16(beta)
-        elif self.element_compute == cutlass_bindings.int32:
-            alpha = int(alpha)
-            beta = int(beta)
-
-        # If cached result is loaded
-        cached_result_loaded = False
-
-        if self.enable_cached_results:
-            # Get problem key
-            cached_test_key = cutlass_bindings.test.conv.host.CreateCachedConv2dTestKey(
-                self.conv_kind,
-                problem_size,
-                alpha,
-                beta,
-                getTensorView(
-                    tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
-                ),
-                getTensorView(
-                    tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
-                ),
-                getTensorView(
-                    tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
-                ),
-            )
-
-            # Make the key activation-specific so that results of different
-            # activations never collide in the cache.
-            cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"
-
-            cached_test_result = cutlass_bindings.test.conv.host.CachedTestResult()
-
-            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
-                self.operation.arch,
-                self.seed,
-            )
-
-            cached_results = cutlass_bindings.test.conv.host.CachedTestResultListing(
-                conv2d_result_cache_name
-            )
-            # find() yields a (found, result) pair
-            cached = cached_results.find(cached_test_key)
-            cached_result_loaded = cached[0]
-            if cached_result_loaded:
-                cached_test_result = cached[1]
-
-        if not cached_result_loaded:
-            # Compute the conv2d on host
-            tensor_D_ref = np.ones_like(tensor_C)
-            tensor_ref_A = getTensorRef(
-                tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
-            )
-            tensor_ref_B = getTensorRef(
-                tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
-            )
-            tensor_ref_C = getTensorRef(
-                tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
-            )
-            tensor_ref_D_ref = getTensorRef(
-                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
-            )
-
-            self.host_conv2d(
-                self.conv_kind,
-                problem_size,
-                tensor_ref_A,
-                tensor_ref_B,
-                tensor_ref_C,
-                tensor_ref_D_ref,
-                alpha,
-                beta,
-            )
-
-            # Leaky ReLU is the only activation that takes an extra argument here.
-            if activation == cutlass.backend.epilogue.leaky_relu:
-                tensor_D_ref = activation.numpy(tensor_D_ref, 0.5)
-            else:
-                tensor_D_ref = activation.numpy(tensor_D_ref)
-
-            tensor_view_D_ref = getTensorView(
-                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
-            )
-
-            if self.enable_cached_results:
-                cached_test_result.D = cutlass_bindings.test.conv.host.TensorHash(
-                    tensor_view_D_ref
-                )
-                # Re-open the listing, add the new entry and write it back.
-                cached_results = (
-                    cutlass_bindings.test.conv.host.CachedTestResultListing(
-                        conv2d_result_cache_name
-                    )
-                )
-                cached_results.append(cached_test_key, cached_test_result)
-                cached_results.write(conv2d_result_cache_name)
-            else:
-                return tensor_D_ref
-
-        return cached_test_result.D
-
-    def equal(self, tensor_D, tensor_D_ref, problem_size):
-        """
-        Compare the computed output with the reference.
-
-        The numpy backend defers to the base class; otherwise the device is
-        synchronized and the tensors are compared element-wise with ``torch.equal``.
-        """
-        if self.backend == "numpy":
-            return super().equal(tensor_D, tensor_D_ref, problem_size)
-        else:
-            torch.cuda.synchronize()
-            return torch.equal(tensor_D, tensor_D_ref)
-
-    def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
-        """
-        Run the plan on randomly initialized tensors and verify the result.
-
-        :param ps: problem size to run
-        :param split_k_mode: split-K mode forwarded to the plan
-        :param split_k_slices: number of split-K slices forwarded to the plan
-        :param alpha: scalar applied to the convolution result
-        :param beta: scalar applied to the source tensor
-
-        :return: result of :meth:`equal` for the computed and the reference output
-        :raises Exception: if the convolution kind is not fprop, dgrad or wgrad
-        """
-        #
-        # Initialize input and output tensors
-        #
-        # Shapes are written channels-first for torch and channels-last otherwise.
-        if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
-            if self.backend == "torch":
-                tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
-                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
-                tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
-            else:
-                tensor_A_size = (ps.N, ps.H, ps.W, ps.C)
-                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
-                tensor_C_size = (ps.N, ps.P, ps.Q, ps.K)
-        elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
-            if self.backend == "torch":
-                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
-                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
-                tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
-            else:
-                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
-                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
-                tensor_C_size = (ps.N, ps.H, ps.W, ps.C)
-        elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
-            if self.backend == "torch":
-                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
-                tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
-                tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
-            else:
-                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
-                tensor_B_size = (ps.N, ps.H, ps.W, ps.C)
-                tensor_C_size = (ps.K, ps.R, ps.S, ps.C)
-        else:
-            raise Exception(f"Conv kind {self.conv_kind} is not supported")
-
-        # Re-seed before every run so that each problem sees reproducible inputs.
-        self.set_seed()
-
-        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
-        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
-        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
-        tensor_D = self.zeros_like(tensor_C)
-
-        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
-            stride=(ps.stride_h, ps.stride_w),
-            padding=(ps.pad_h, ps.pad_w),
-            dilation=(ps.dilation_h, ps.dilation_w),
-            alpha=alpha, beta=beta,
-            split_k=(split_k_mode, split_k_slices))
-
-        tensor_D_ref = self.reference(
-            ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
-        )
-
-        return self.equal(tensor_D, tensor_D_ref, ps)
-
-
-def add_test(
-    cls,
-    cc,
-    conv_kind,
-    problem_sizes,
-    element,
-    element_accumulator,
-    element_output,
-    opclass,
-    threadblock_shape,
-    warp_count,
-    instruction_shape,
-    stages,
-    iterator_algorithm=None,
-    swizzle=None,
-    split_k_mode="serial",
-    split_k_slices=1,
-    activation="identity"
-):
-    """
-    Build a Conv2d test method for the given configuration and attach it to ``cls``.
-
-    The generated method creates a ``cutlass.Conv2d`` plan, configures it from
-    the arguments and runs it (numpy backend, seed 80, alpha 1.0, beta 0.5) on
-    every problem size accepted by ``validate_problem_size``, asserting that
-    each run succeeds.
-
-    :param cls: test-case class that receives the generated method
-    :param cc: compute capability, used for the test name
-    :param conv_kind: convolution kind passed to the plan
-    :param problem_sizes: iterable of problem sizes to run
-    :param element: data type passed to the plan as ``element``
-    :param element_accumulator: accumulator data type
-    :param element_output: data type used for both C and D
-    :param opclass: operation class assigned to the plan
-    :param threadblock_shape: threadblock shape of the tile description
-    :param warp_count: warp count of the tile description
-    :param instruction_shape: instruction shape of the tile description
-    :param stages: stage count of the tile description
-    :param iterator_algorithm: iterator algorithm; left untouched when None
-    :param swizzle: swizzling stride; left untouched when None
-    :param split_k_mode: split-K mode forwarded to the launcher
-    :param split_k_slices: number of split-K slices
-    :param activation: name of the activation in ``cutlass.epilogue``
-
-    :return: the generated test method
-    """
-    test_name = get_name_conv2d(
-        cc, conv_kind, element, element_accumulator,
-        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
-        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)
-
-    def run(self):
-        plan = cutlass.Conv2d(
-            kind=conv_kind,
-            element=element,
-            element_accumulator=element_accumulator,
-            element_C=element_output,
-            element_D=element_output
-        )
-        plan.opclass = opclass
-        plan.tile_description = {
-            "threadblock_shape": threadblock_shape,
-            "warp_count": warp_count,
-            "stages": stages,
-            "instruction_shape": instruction_shape,
-        }
-
-        # Optional knobs are only touched when the caller asked for them.
-        if iterator_algorithm is not None:
-            plan.iterator_algorithm = iterator_algorithm
-        if swizzle is not None:
-            plan.swizzling_stride = swizzle
-
-        # "identity" keeps the plan's default; leaky ReLU carries its slope.
-        if activation == "leaky_relu":
-            plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
-        elif activation != "identity":
-            plan.activation = getattr(cutlass.epilogue, activation)
-
-        launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")
-
-        for ps in problem_sizes:
-            if validate_problem_size(ps, conv_kind, split_k_slices):
-                passed = launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5)
-                self.assertTrue(passed)
-
-    setattr(cls, test_name, run)
-
-    return run
-
-
-def get_conv_problems():
-    """
-    Return the default Conv2d problem sizes followed by three extra problems.
-
-    The extra problems use channel counts 12, 14 and 98 and share the
-    coordinate (3, 3) as fourth argument.
-
-    :return: list of ``cutlass_bindings.conv.Conv2dProblemSize`` objects
-    """
-    def extra_problem(input_coord, filter_coord, pad_coord):
-        # Everything except the three tensor coordinates is shared.
-        return cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(*input_coord),
-            cutlass_bindings.Tensor4DCoord(*filter_coord),
-            cutlass_bindings.Tensor4DCoord(*pad_coord),
-            cutlass_bindings.MatrixCoord(3, 3),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
-            1, 1
-        )
-
-    # 64: minimum channel size
-    default_sizes = cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes
-    conv_problems = list(default_sizes)
-
-    # Insert alignment 4 & 2 tests
-    conv_problems.extend([
-        extra_problem((1, 4, 4, 12), (8, 3, 3, 12), (0, 0, 0, 0)),
-        extra_problem((1, 4, 4, 14), (8, 3, 3, 14), (0, 0, 0, 0)),
-        extra_problem((1, 23, 56, 98), (128, 3, 3, 98), (4, 0, 5, 0)),
-    ])
-
-    return conv_problems
--- a/test/python/cutlass/conv2d/conv2d_problem_sizes.py
+++ b/test/python/cutlass/conv2d/conv2d_problem_sizes.py
@ -0,0 +1,660 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Utilities for defining Conv2D problem sizes for testing.
+
+This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
+"""
+
+import cutlass
+from cutlass import ConvMode
+from cutlass.shape import Conv2DProblemSize
+
+
+class TestbedConv2dProblemSizes:
+    def __init__(self, minimum_channel_size: int):
+        """
+        Gather every known problem size that is compatible with ``minimum_channel_size``.
+
+        The default, rigorous, ResNet-50 (batch sizes 1 and 34) and grouped
+        problem lists are generated, and ``self.all`` keeps, in that order, the
+        problems whose channels per group (``C // groups``) are a multiple of
+        ``minimum_channel_size``.
+
+        :param minimum_channel_size: channel granularity every retained problem must satisfy
+        """
+        size_lists = (
+            self.initialize_conv2d_default_sizes(minimum_channel_size),
+            self.initialize_conv2d_rigorous_sizes(minimum_channel_size),
+            self.initialize_conv2d_resnet50_sizes(1),
+            self.initialize_conv2d_resnet50_sizes(34),
+            self.initialize_conv2d_grouped_sizes(),
+        )
+
+        # Filter all problems
+        self.all = [
+            problem
+            for problems in size_lists
+            for problem in problems
+            if (problem.C // problem.groups) % minimum_channel_size == 0
+        ]
+
+
+    def initialize_conv2d_default_sizes(self, minimum_channel_size):
+        """
+        Build the default list of Conv2D problem sizes.
+
+        Each table row holds the 14 positional arguments of one
+        ``Conv2DProblemSize``, spaced in groups of 4 / 4 / 2 / 2 / 2.
+        NOTE(review): the groups presumably are (N, H, W, C), (K, R, S, C),
+        padding, stride and dilation -- confirm against
+        ``cutlass.shape.Conv2DProblemSize``.
+
+        :param minimum_channel_size: channel count used by the rows that scale with it
+
+        :return: list of ``Conv2DProblemSize`` objects in table order
+        """
+        m = minimum_channel_size
+
+        table = [
+            # Small input size x stride (1,1)
+            # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+            (1, 1, 1, m,            8, 1, 1, m,            1, 1,    1, 1,    1, 1),
+            (1, 1, 8, m,            8, 1, 3, m,            1, 1,    1, 1,    1, 1),
+            (1, 7, 8, m,            8, 3, 3, m,            1, 1,    1, 1,    1, 1),
+            (1, 7, 9, m,            8, 4, 4, m,            1, 1,    1, 1,    1, 1),
+            (2, 7, 9, m,            8, 5, 5, m,            1, 1,    1, 1,    1, 1),
+            (3, 7, 9, m,            8, 6, 5, m,            1, 1,    1, 1,    1, 1),
+            (3, 7, 9, m,            8, 6, 6, m,            1, 1,    1, 1,    1, 1),
+            (3, 7, 9, m,            8, 7, 7, m,            1, 1,    1, 1,    1, 1),
+
+            # Small input size x stride (2,2)
+            # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+            (1, 11, 7, m,           8, 1, 1, m,            0, 0,    2, 2,    1, 1),
+            (1, 11, 7, m,           8, 3, 3, m,            1, 1,    2, 2,    1, 1),
+            (1, 13, 11, m,          8, 1, 1, m,            1, 1,    2, 2,    1, 1),
+            (1, 17, 19, m,          16, 2, 2, m,           1, 1,    2, 2,    1, 1),
+            (1, 23, 5, m,           16, 3, 3, m,           1, 1,    2, 2,    1, 1),
+            (1, 13, 17, 8,          24, 3, 3, 8,           0, 0,    2, 2,    1, 1),
+            (1, 23, 21, 8,          24, 3, 3, 8,           1, 1,    3, 3,    1, 1),
+            (1, 20, 24, 8,          40, 3, 3, 8,           3, 3,    3, 3,    1, 1),
+
+            # Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1)
+            (1, 15, 19, 160,        224, 1, 1, 160,        0, 0,    1, 1,    1, 1),
+            (1, 19, 37, 160,        224, 3, 3, 160,        1, 1,    2, 2,    1, 1),
+            (1, 16, 16, 160,        224, 2, 3, 160,        1, 1,    1, 1,    1, 1),
+            (1, 23, 21, 128,        224, 3, 3, 128,        1, 1,    1, 1,    1, 1),
+            (1, 29, 37, 160,        224, 5, 5, 160,        2, 2,    1, 1,    1, 1),
+
+            # C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+            (1, 15, 19, 32 + m,     96, 3, 3, 32 + m,      1, 1,    1, 1,    1, 1),
+            (1, 16, 24, 64 + m,     96, 3, 3, 64 + m,      1, 1,    1, 1,    1, 1),
+
+            # Medium input size, filter size (1x1, 3x3, 5x5, 7x7), stride (2, 2)
+            (1, 13, 16, 288,        160, 5, 5, 288,        2, 2,    2, 2,    1, 1),
+            (1, 55, 51, 256,        512, 1, 1, 256,        0, 0,    2, 2,    1, 1),
+            (1, 71, 80, 32,         64, 5, 5, 32,          2, 2,    2, 2,    1, 1),
+            (1, 224, 224, 8,        64, 7, 7, 8,           3, 3,    2, 2,    1, 1),
+
+            # Medium input size stride (3, 3), filter (3, 3), non-default padding
+            (1, 27, 23, 256,        512, 3, 3, 256,        0, 0,    3, 3,    1, 1),
+
+            # Medium input size padding > stride, asymmetric filter, padding and striding
+            (1, 27, 31, 256,        512, 3, 3, 256,        5, 7,    3, 4,    1, 1),
+            (1, 27, 35, 256,        512, 7, 5, 256,        11, 7,   3, 5,    1, 1),
+
+            # Medium input size *mixed* stride (1, 2) and (2, 1),
+            # filter (3, 3), default padding
+            (1, 27, 27, 256,        512, 3, 3, 256,        1, 1,    1, 2,    1, 1),
+            (1, 27, 27, 256,        512, 3, 3, 256,        1, 1,    2, 1,    1, 1),
+
+            # Additional input size
+            (3, 28, 28, 256,        256, 2, 2, 256,        0, 0,    2, 2,    1, 1),
+            (1, 32, 32, 16,         32, 3, 3, 16,          1, 1,    6, 2,    1, 1),
+            (32, 24, 32, 32,        32, 1, 2, 32,          0, 0,    1, 1,    1, 1),
+            (4, 2, 3, 256,          328, 3, 5, 256,        1, 1,    1, 1,    1, 1),
+        ]
+
+        return [Conv2DProblemSize(*arguments) for arguments in table]
+
+    # Add a few large and rigorous convolution problem sizes
+    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size):
+        """
+        Build the list of large, rigorous conv2d problem sizes.
+
+        Both candidate problems sit behind an ``if False:`` guard, so they are
+        never appended and this method currently always returns an empty list.
+
+        :param minimum_channel_size: channel count the (disabled) problems are
+            scaled by; the first problem uses twice this value
+
+        :return: list of problem sizes (always empty while the guard is ``False``)
+        :rtype: list
+        """
+        sizes = []
+        # NOTE(review): the rigorous sizes are switched off unconditionally;
+        # presumably to keep test time down -- confirm before re-enabling.
+        if False:
+            sizes.append(Conv2DProblemSize.from_sizes(
+              (1, 124, 224, 2 * minimum_channel_size),
+              (24, 7, 7, 2 * minimum_channel_size),
+            ))
+
+            sizes.append(Conv2DProblemSize.from_sizes(
+              (1, 233, 35, minimum_channel_size),
+              (24, 7, 5, minimum_channel_size),
+            ))
+        return sizes
+
+    # Add resnet50 layers to unit testing sizes
+    def initialize_conv2d_resnet50_sizes(self, batch_size):
+        """
+        Build the list of conv2d problem sizes modelled on ResNet-50 layers.
+
+        Nineteen problems are returned. Each one is written as five rows of
+        positional arguments which, judging by how the other problem-size
+        tables in this file are commented, appear to be:
+        input (N, H, W, C), filter (K, R, S, C), padding (h, w),
+        stride (h, w) and dilation (h, w).
+
+        :param batch_size: value used as the first (batch) argument of every problem
+
+        :return: list of Conv2DProblemSize objects
+        :rtype: list
+        """
+        conv2d_problem_vector = []
+        # 56x56 feature maps
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 64,
+          256, 1, 1, 64,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 64,
+          64, 1, 1, 64,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 64,
+          64, 3, 3, 64,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 256,
+          64, 1, 1, 256,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 256,
+          512, 1, 1, 256,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 256,
+          128, 1, 1, 256,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+
+        # 28x28 feature maps
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 128,
+          128, 3, 3, 128,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 128,
+          512, 1, 1, 128,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 512,
+          128, 1, 1, 512,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 512,
+          1024, 1, 1, 512,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 512,
+          256, 1, 1, 512,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+
+        # 14x14 feature maps
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 256,
+          256, 3, 3, 256,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 256,
+          1024, 1, 1, 256,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 1024,
+          256, 1, 1, 1024,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 1024,
+          2048, 1, 1, 1024,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 1024,
+          512, 1, 1, 1024,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+
+        # 7x7 feature maps
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 7, 7, 512,
+          512, 3, 3, 512,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 7, 7, 512,
+          2048, 1, 1, 512,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 7, 7, 2048,
+          512, 1, 1, 2048,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+
+        return conv2d_problem_vector
+
+    def initialize_conv2d_grouped_sizes(self):
+        """
+        Build the list of grouped-convolution problem sizes.
+
+        The cases are derived from a fixed threadblock (CTA) tile of N = 128
+        and K = 32 and fall into two families, as described by the section
+        comments below: one group computed by one or more CTAs, and one CTA
+        computing several groups. Sixteen problems are returned in total
+        (nine from the nested loop plus seven explicit ones).
+
+        Every problem passes three trailing arguments after the dilation row:
+        ``ConvMode.CrossCorrelation``, ``1`` and the group count. The ``1``
+        is presumably split_k_slices -- confirm against Conv2DProblemSize.
+
+        :return: list of Conv2DProblemSize objects
+        :rtype: list
+        """
+        # Threadblock tile extents (CTA::N and CTA::K in the comments below)
+        threadblock_n = 128
+        threadblock_k = 32
+
+        sizes = []
+        ##########################################
+        # One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
+        # One CTA calculates a single group
+        ##########################################
+        for cta_per_group_k in range(1, 4):
+            for groups in range(2, 5):
+                # Each group gets cta_per_group_k * threadblock_n output channels
+                conv_k = cta_per_group_k * threadblock_n * groups
+                sizes.append(Conv2DProblemSize(
+                  1, 8, 8, threadblock_k * 2 * groups,
+                  conv_k, 3, 3, threadblock_k * 2,
+                  1, 1,
+                  1, 1,
+                  1, 1,
+                  ConvMode.CrossCorrelation,
+                  1,
+                  groups
+                ))
+
+        # Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k,
+          threadblock_n * 2, 3, 3, threadblock_k // 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          2
+        ))
+
+        # Two 3-group problems with 232 channels per group
+        sizes.append(Conv2DProblemSize(
+          1, 56, 56, 696,
+          768, 3, 3, 232,
+          1, 1,
+          2, 2,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          3
+        ))
+        sizes.append(Conv2DProblemSize(
+          1, 14, 14, 1392,
+          1536, 3, 3, 232,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          3
+        ))
+
+        ##########################################
+        # One CTA calculate multiple groups: CTA::N % k_per_group = 0
+        ##########################################
+
+        # 2 groups per CTA
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k * 4,
+          threadblock_n, 3, 3, threadblock_k * 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          2
+        ))
+
+        # 2 groups per CTA and partial gemm_k
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k,
+          threadblock_n, 3, 3, threadblock_k // 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          2
+        ))
+
+        # 4 groups per CTA
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k * 8,
+          threadblock_n // 2, 3, 3, threadblock_k * 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          4
+        ))
+
+        # 4 groups per CTA and partial gemm_k
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k * 2,
+          threadblock_n // 2, 3, 3, threadblock_k // 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          4
+        ))
+
+        return sizes
--- a/test/python/cutlass/conv2d/conv2d_sm80.py
+++ b/test/python/cutlass/conv2d/conv2d_sm80.py
@ -31,56 +31,64 @@
 #################################################################################################

 """
-Low-level functionality tests for Conv2d operands on SM80
+Low-level functionality tests for Conv2d operations on SM80
 """
-from conv2d_test_utils import *
-import cutlass
+
 import logging
+import unittest
+
+import cutlass
+from cutlass.backend.utils.device import device_cc
+
+from conv2d_test_utils import *


 cutlass.set_log_level(logging.WARNING)
 cc = 80

-@unittest.skipIf(device_cc() != cc, 'Device compute capability is invalid for SM80 tests.')
+
+@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
 class Conv2dSm80(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__
    """
    pass

+
 conv_problems = get_conv_problems()

+
 # Tests for optimized & analytic
 for conv_kind in ["fprop", "wgrad", "dgrad"]:
    # F16, simt
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-        opclass="simt", threadblock_shape=[128, 128, 8], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
    # F16, tensor op
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, analytic iterator
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, 
-        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
    # F16, tensor op, f32 output
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, 
-        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, different tile description
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-        opclass="tensor_op", threadblock_shape=[128, 64, 32], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
    # F32, simt
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, 
-        opclass="simt", threadblock_shape=[128, 128, 8], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
+        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
    # Tf32, tensorop
    add_test(
@ -90,19 +98,19 @@ for conv_kind in ["fprop", "wgrad", "dgrad"]:
    )
    # Split-K
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
        split_k_slices=2)
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
        split_k_slices=5)
    # Swizzling functor
    add_test(
-        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-        opclass="tensor_op", threadblock_shape=[128, 64, 32], 
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)

 # Tests for few channels and fixed channels
@ -113,14 +121,14 @@ for c, tb, stage, inst in zip([2, 1],
                                [[16, 8, 16], [16, 8, 8]]):
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
-        opclass="tensor_op", threadblock_shape=tb, 
+        opclass="tensor_op", threadblock_shape=tb,
        warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
    )
 # F16, tensor op, fixed channels
 for c in [8, 4, 2]:
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
-        opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
    )

@ -128,11 +136,11 @@ for c in [8, 4, 2]:
 for activation in ["relu", "leaky_relu"]:
    for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
        add_test(
-            Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16, 
-            opclass="tensor_op", threadblock_shape=[128, 128, 64], 
+            Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
+            opclass="tensor_op", threadblock_shape=[128, 128, 64],
            warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
            split_k_slices=split_k_slices, activation=activation)
-    
+

 if __name__ == '__main__':
    unittest.main()
--- a/test/python/cutlass/conv2d/conv2d_test_utils.py
+++ b/test/python/cutlass/conv2d/conv2d_test_utils.py
@ -0,0 +1,425 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Utility functions for Conv2d tests.
+"""
+
+import torch
+
+import cutlass
+from cutlass import (
+    ConvKind,
+    ConvMode,
+    DataType,
+    DataTypeNames,
+    EpilogueScheduleSuffixes,
+    KernelScheduleSuffixes,
+    LayoutType,
+    OpcodeClassNames,
+    ShortDataTypeNames,
+    ShortLayoutTypeNames,
+    SplitKMode,
+)
+from cutlass.backend.utils.software import SubstituteTemplate
+from cutlass.shape import Conv2DProblemSize
+from cutlass.utils.datatypes import numpy_type, torch_type
+
+from conv2d_problem_sizes import TestbedConv2dProblemSizes
+
+
+def get_name_conv2d(
+    arch,
+    conv_kind,
+    element,
+    element_accumulator,
+    element_output,
+    opclass,
+    threadblock_shape,
+    warp_count,
+    instruction_shape,
+    stages,
+    iterator_algorithm,
+    swizzle,
+    split_k_mode,
+    split_k_slices,
+    activation
+):
+    """
+    Generates a procedural name for a test case for conv2d
+
+    :param arch: compute capability of kernel being generated
+    :type arch: int
+    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
+    :type conv_kind: str
+    :param element: data type of operands A and B (fills both the A and B name fields)
+    :param element_accumulator: data type used in accumulation
+    :param element_output: data type used for the C name field
+    :param opclass: name of the class of operation being performed (e.g., "simt", "tensor_op");
+        inserted into the test name as-is
+    :type opclass: str
+    :param threadblock_shape: indexable container of dimensions of threadblock tiles
+    :param warp_count: indexable container with the number of warps along each threadblock
+        dimension; the warp tile in the name is threadblock_shape // warp_count
+    :param instruction_shape: indexable container of dimensions of the instruction shape
+    :param stages: number of pipeline stages to use in the kernel
+    :type stages: int
+    :param iterator_algorithm: name of the iterator algorithm applied; ``None`` is reported as "AUTO"
+    :param swizzle: swizzling stride; ``None`` is reported as 1
+    :param split_k_mode: name of the split-K mode (e.g., "serial", "parallel")
+    :type split_k_mode: str
+    :param split_k_slices: number of split-K slices
+    :param activation: name of the activation function (e.g., "identity", "relu")
+    :type activation: str
+
+    :return: the generated test name
+    :rtype: str
+    """
+    # Fall back to the values that get reported when the caller did not choose any
+    if iterator_algorithm is None:
+        iterator_algorithm = "AUTO"
+    if swizzle is None:
+        swizzle = 1
+    name_format = "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"
+
+    return SubstituteTemplate(
+        name_format,
+        {
+            "arch": str(arch),
+            "conv_kind": conv_kind,
+            "iter_alg": iterator_algorithm,
+            # A and B share the same element type
+            "eA": DataTypeNames[element],
+            "eB": DataTypeNames[element],
+            "eC": DataTypeNames[element_output],
+            "opclass": opclass,
+            "acc": DataTypeNames[element_accumulator],
+            "tbM": str(threadblock_shape[0]),
+            "tbN": str(threadblock_shape[1]),
+            "tbK": str(threadblock_shape[2]),
+            # Warp tile = threadblock tile divided by the warp count per dimension
+            "wM": str(threadblock_shape[0] // warp_count[0]),
+            "wN": str(threadblock_shape[1] // warp_count[1]),
+            "wK": str(threadblock_shape[2] // warp_count[2]),
+            "IM": str(instruction_shape[0]),
+            "IN": str(instruction_shape[1]),
+            "IK": str(instruction_shape[2]),
+            "stages": str(stages),
+            "swizzle": str(swizzle),
+            "split_k_mode": split_k_mode,
+            "split_k_slices": str(split_k_slices),
+            "activation": activation
+        }
+    )
+
+
+def conv2d_few_channel_problemsizes(channels):
+    """
+    Build the problem sizes used by the few-channels and fixed-channels tests.
+
+    Seven problems are returned. All use a leading (batch) argument of 1,
+    ``ConvMode.CrossCorrelation`` and trailing arguments ``1, 1``; they vary
+    the input extent (8, 16 or 224), the filter extent (3, 5 or 7), the
+    filter count (16, 32 or 64) and the stride (1 or 2).
+
+    :param channels: channel count used for both the input and the filter of every problem
+
+    :return: list of Conv2DProblemSize objects
+    :rtype: list
+    """
+    problem_sizes = [
+        Conv2DProblemSize(
+            1, 8, 8, channels,
+            16, 3, 3, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 16, 16, channels,
+            16, 3, 3, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 16, 16, channels,
+            16, 7, 7, channels,
+            1, 1,
+            1, 1,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            32, 7, 7, channels,
+            1, 1,
+            1, 1,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            64, 7, 7, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            64, 5, 5, channels,
+            1, 1,
+            1, 1,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            64, 5, 5, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+    ]
+
+    return problem_sizes
+
+
+def validate_problem_size(ps, conv_kind, split_k_slices):
+    """
+    Decide whether a problem size should be run for the given test configuration.
+
+    A problem is rejected when its stored output extent (``ps.P``, ``ps.Q``)
+    does not match the extent recomputed here from the input size, padding,
+    dilation, filter size and stride, or when it combines dgrad, more than
+    one split-K slice and a stride greater than 1.
+
+    :param ps: problem size exposing H, W, R, S, P, Q and the pad/stride/dilation fields
+    :param conv_kind: the convolution type; compared against the string "dgrad"
+    :type conv_kind: str
+    :param split_k_slices: number of split-K slices the test will use
+    :type split_k_slices: int
+
+    :return: True if the problem should be run, False if it should be skipped
+    :rtype: bool
+    """
+    # Recompute the output height/width with floor division
+    P = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
+    Q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
+    if P != ps.P or Q != ps.Q:
+        return False
+
+    # Split-K (serial or parallel) is not supported for strided dgrad
+    if conv_kind == "dgrad" and split_k_slices > 1 and (ps.stride_h > 1 or ps.stride_w > 1):
+        return False
+    return True
+
+
+class Conv2dLauncherFrontend:
+    """
+    Runs a ``cutlass.Conv2d`` plan on randomly initialized torch tensors and
+    checks the result against a torch reference computation.
+    """
+
+    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
+        """
+        :param plan: the Conv2d plan under test
+        :type plan: cutlass.Conv2d
+        :param seed: seed passed to ``torch.manual_seed`` before tensors are generated
+        :type seed: int
+        :param backend: stored on the instance; not read anywhere else in this class
+        """
+        self.operation = plan
+        self.conv_kind = plan.conv_kind
+        self.seed = seed
+        # NOTE(review): tensors are always created with torch regardless of this value
+        self.backend = backend
+
+        self.dtype_A = plan._element_a
+        self.dtype_B = plan._element_b
+        self.dtype_C = plan._element_c
+        self.dtype_acc = plan._element_accumulator
+        self.layout_A = LayoutType.TensorNHWC
+        self.layout_B = LayoutType.TensorNHWC
+        self.layout_C = LayoutType.TensorNHWC
+        self.layout_D = LayoutType.TensorNHWC
+
+        self.element_compute = DataType.f32
+
+        # Half-precision inputs get a narrower value range; presumably so that the
+        # exact comparison in run() is not upset by rounding -- confirm
+        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
+            self.rand_max = 1
+        else:
+            self.rand_max = 4
+        self.activation = plan.activation
+
+    def uniform_init(self, size, dtype):
+        """
+        Create a CUDA tensor of integer-valued random entries.
+
+        Values are drawn uniformly from [-rand_max - 0.5, rand_max - 0.5) and
+        rounded up with ``torch.ceil``, so every entry is a whole number between
+        -rand_max and rand_max.
+
+        :param size: shape of the tensor to create
+        :param dtype: data type, converted with ``torch_type``
+
+        :return: the tensor, converted to the channels_last memory format
+        :rtype: torch.Tensor
+        """
+        tensor = torch.ceil(
+            torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
+        ).to(memory_format=torch.channels_last)
+        return tensor
+
+    def reference(self, ps, A, B, C, alpha, beta, activation):
+        """
+        Compute the expected result ``activation(alpha * conv(A, B) + beta * C)`` with torch.
+
+        :param ps: problem size supplying padding, stride, dilation and tensor extents
+        :param A: operand A (input for fprop, output gradient for dgrad/wgrad)
+        :param B: operand B (filter for fprop/dgrad, input for wgrad)
+        :param C: source tensor scaled by ``beta``
+        :param alpha: scale applied to the convolution result
+        :param beta: scale applied to ``C``
+        :param activation: activation to apply; only relu and leaky_relu are handled,
+            anything else leaves the result unchanged
+
+        :return: the reference result
+        :rtype: torch.Tensor
+
+        :raises Exception: if ``self.conv_kind`` is not Fprop, Dgrad or Wgrad
+        """
+        if self.conv_kind == ConvKind.Fprop:
+            torch_result = alpha * torch.ops.aten.conv2d(
+                A,
+                B,
+                stride=(ps.stride_h, ps.stride_w),
+                padding=(ps.pad_h, ps.pad_w),
+                dilation=(ps.dilation_h, ps.dilation_w)
+            ) + beta * C
+        elif self.conv_kind == ConvKind.Dgrad:
+            # NOTE(review): dilation is not forwarded here, so torch's default is used;
+            # presumably fine while every problem size uses dilation 1 -- confirm
+            torch_result = alpha * torch.nn.grad.conv2d_input(
+                (ps.N, ps.C, ps.H, ps.W),
+                B,
+                A,
+                padding=(ps.pad_h, ps.pad_w),
+                stride=(ps.stride_h, ps.stride_w)
+            ) + beta * C
+        elif self.conv_kind == ConvKind.Wgrad:
+            # NOTE(review): dilation is not forwarded here either -- see the Dgrad branch
+            torch_result = alpha * torch.nn.grad.conv2d_weight(
+                B,
+                (ps.K, ps.C, ps.R, ps.S),
+                A,
+                padding=(ps.pad_h, ps.pad_w),
+                stride=(ps.stride_h, ps.stride_w)
+            ) + beta * C
+        else:
+            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
+
+        # NOTE(review): this compares against cutlass.backend.epilogue objects; verify
+        # that plan.activation returns these same objects after it has been assigned
+        if activation == cutlass.backend.epilogue.relu:
+            torch_result = torch.nn.functional.relu(torch_result)
+        elif activation == cutlass.backend.epilogue.leaky_relu:
+            # Negative slope of 0.5
+            torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
+        return torch_result
+
+    def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
+        """
+        Run the plan on one problem size and compare against the torch reference.
+
+        :param ps: problem size to run
+        :param split_k_mode: split-K mode forwarded to the plan
+        :param split_k_slices: number of split-K slices forwarded to the plan
+        :param alpha: scale applied to the convolution result
+        :param beta: scale applied to the source tensor C
+
+        :return: True if the plan's output is exactly equal to the reference
+        :rtype: bool
+
+        :raises Exception: if ``self.conv_kind`` is not Fprop, Dgrad or Wgrad
+        """
+        # Tensor shapes are given in (N, C, H, W) order; which tensor plays the
+        # role of A, B and C depends on the convolution kind
+        if self.conv_kind == ConvKind.Fprop:
+            tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
+            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+            tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
+        elif self.conv_kind == ConvKind.Dgrad:
+            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+            tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
+        elif self.conv_kind == ConvKind.Wgrad:
+            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+            tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
+            tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
+        else:
+            raise Exception(f"Conv kind {self.conv_kind} is not supported")
+
+        # Re-seed on every call so each problem sees reproducible data
+        torch.manual_seed(self.seed)
+
+        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
+        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
+        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
+        tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)
+        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
+            stride=(ps.stride_h, ps.stride_w),
+            padding=(ps.pad_h, ps.pad_w),
+            dilation=(ps.dilation_h, ps.dilation_w),
+            alpha=alpha, beta=beta,
+            split_k=(split_k_mode, split_k_slices))
+
+        tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)
+
+        # Wait for outstanding GPU work before comparing
+        torch.cuda.synchronize()
+        # Exact, element-wise comparison (no tolerance)
+        passed = torch.equal(tensor_D, tensor_D_ref)
+
+        return passed
+
+
+def add_test(
+    cls,
+    cc,
+    conv_kind,
+    problem_sizes,
+    element,
+    element_accumulator,
+    element_output,
+    opclass,
+    threadblock_shape,
+    warp_count,
+    instruction_shape,
+    stages,
+    iterator_algorithm=None,
+    swizzle=None,
+    split_k_mode="serial",
+    split_k_slices=1,
+    activation = "identity"
+):
+    """
+    Create a test-running function with the given specification and attach it to ``cls``
+
+    The generated test builds a ``cutlass.Conv2d`` plan, configures it from the
+    arguments and asserts that every valid problem size matches the reference.
+
+    :param cls: test case class the generated method is added to
+    :param cc: compute capability; used only to build the test name
+    :type cc: int
+    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
+    :type conv_kind: str
+    :param problem_sizes: iterable of problem sizes to run
+    :param element: data type of operands A and B
+    :param element_accumulator: data type used in accumulation
+    :param element_output: data type of operands C and D
+    :param opclass: class of operation assigned to the plan (e.g., "simt", "tensor_op")
+    :param threadblock_shape: indexable container of dimensions of threadblock tiles
+    :param warp_count: indexable container with the number of warps per threadblock dimension
+    :param instruction_shape: indexable container of dimensions of the instruction shape
+    :param stages: number of pipeline stages to use in the kernel
+    :type stages: int
+    :param iterator_algorithm: iterator algorithm to set on the plan; left untouched if ``None``
+    :param swizzle: swizzling stride to set on the plan; left untouched if ``None``
+    :param split_k_mode: split-K mode passed to the launcher
+    :type split_k_mode: str
+    :param split_k_slices: number of split-K slices passed to the launcher
+    :type split_k_slices: int
+    :param activation: name of the activation; "identity" leaves the plan's activation untouched
+    :type activation: str
+
+    :return: the generated test function
+    """
+    test_name = get_name_conv2d(
+        cc, conv_kind, element, element_accumulator,
+        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
+        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)
+
+    def run(self):
+        # Create the plan
+        plan = cutlass.Conv2d(
+            kind=conv_kind,
+            element=element,
+            element_accumulator=element_accumulator,
+            element_C=element_output,
+            element_D=element_output
+        )
+
+        # Set the opclass
+        plan.opclass = opclass
+        # Set the tile description
+        td = {
+            "threadblock_shape": threadblock_shape,
+            "warp_count": warp_count,
+            "stages": stages,
+            "instruction_shape": instruction_shape,
+        }
+
+        plan.tile_description = td
+        # Set iterator algorithm
+        if iterator_algorithm is not None:
+            plan.iterator_algorithm = iterator_algorithm
+        # Set swizzling functor
+        if swizzle is not None:
+            plan.swizzling_stride = swizzle
+
+        # leaky_relu is assigned together with its slope (0.5, the same value the
+        # reference uses); other activations are looked up by name
+        if activation != "identity":
+            if activation == "leaky_relu":
+                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
+            else:
+                plan.activation = getattr(cutlass.epilogue, activation)
+
+        # Fixed seed of 80 for reproducible tensors
+        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")
+
+        for ps in problem_sizes:
+            # Skip problem sizes that are inconsistent or unsupported for this configuration
+            if not validate_problem_size(ps, conv_kind, split_k_slices): continue
+
+            # Run with alpha = 1.0 and beta = 2.0 so the source tensor C contributes
+            self.assertTrue(conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))
+
+    # Register the test under its generated name so unittest discovers it
+    setattr(cls, test_name, run)
+
+    return run
+
+
+def get_conv_problems():
+    """
+    Collect the conv2d problem sizes shared by the tests.
+
+    Starts from ``TestbedConv2dProblemSizes(64).all`` and appends three extra
+    problems whose channel counts (12, 14 and 98) exercise alignment 4 and 2.
+
+    :return: the combined problem sizes
+    :rtype: list
+    """
+    # 64: minimum channel size
+    conv_problems = TestbedConv2dProblemSizes(64).all
+
+    # Insert alignment 4 & 2 tests
+    conv_problems += [
+        Conv2DProblemSize(
+            1, 4, 4, 12,
+            8, 3, 3, 12,
+            0, 0,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 4, 4, 14,
+            8, 3, 3, 14,
+            0, 0,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 23, 56, 98,
+            128, 3, 3, 98,
+            4, 5,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+    ]
+
+    return conv_problems
--- a/test/python/cutlass/conv2d/run_all_tests.py
+++ b/test/python/cutlass/conv2d/run_all_tests.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@ -30,13 +30,15 @@
 #
 #################################################################################################

-import cutlass.backend
+import pathlib
 import unittest
-from cutlass.backend.memory_manager import *
+

 if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**32, 2**32)
    loader = unittest.TestLoader()
-    tests = loader.discover('./', 'conv2d_*.py')
+    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
+    tests = loader.discover(script_dir, 'conv2d_*.py')
    testRunner = unittest.runner.TextTestRunner()
-    testRunner.run(tests)
+    results = testRunner.run(tests)
+    if not results.wasSuccessful():
+        raise Exception('Test cases failed')
--- a/test/python/cutlass/emit/pytorch.py
+++ b/test/python/cutlass/emit/pytorch.py
@ -39,7 +39,6 @@ import tempfile
 import unittest

 import cutlass
-import cutlass_bindings

 if cutlass.utils.datatypes.torch_available:
    import torch
@ -94,7 +93,7 @@ def _generate_conv2d_problem(conv_kind, dtype, ps):
    :type conv_kind: str
    :param dtype: data type of tensors
    :param problem_size: the conv2d problem size
-    :type problem_size: cutlass_bindings.conv.Conv2dProblemSize
+    :type problem_size: cutlass.shape.Conv2DProblemSize

    :return: initialized tensors A, B, C, and D
    :rtype: list
@ -196,13 +195,11 @@ class PyTorchExtensionTest(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
        
-        problem_size = cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
-            cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
-            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-            cutlass_bindings.MatrixCoord(3, 3),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
+        problem_size = cutlass.shape.Conv2DProblemSize(
+            1, 4, 4, 16,
+            8, 3, 3, 16,
+            0, 0,
+            3, 3,
            1, 1
        )
        
@ -239,13 +236,13 @@ class PyTorchExtensionTest(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
        
-        problem_size = cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
-            cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
-            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-            cutlass_bindings.MatrixCoord(3, 3),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
+        problem_size = cutlass.shape.Conv2DProblemSize(
+            1, 4, 4, 16,
+            8, 3, 3, 16,
+            0, 0,
+            3, 3,
+            1, 1,
+            cutlass.ConvMode.CrossCorrelation,
            1, 1
        )
        
@ -273,13 +270,13 @@ class PyTorchExtensionTest(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
        
-        problem_size = cutlass_bindings.conv.Conv2dProblemSize(
-            cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
-            cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
-            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
-            cutlass_bindings.MatrixCoord(3, 3),
-            cutlass_bindings.MatrixCoord(1, 1),
-            cutlass_bindings.conv.Mode.cross_correlation,
+        problem_size = cutlass.shape.Conv2DProblemSize(
+            1, 4, 4, 16,
+            8, 3, 3, 16,
+            0, 0,
+            3, 3,
+            1, 1,
+            cutlass.ConvMode.CrossCorrelation,
            1, 1
        )
        
--- a/test/python/cutlass/evt/evt_compute_sm80_90.py
+++ b/test/python/cutlass/evt/evt_compute_sm80_90.py
@ -0,0 +1,100 @@
+################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+"""
+Unit test for compute nodes in SM80 and SM90
+"""
+
+import logging
+import unittest
+
+import cutlass
+from cutlass.backend import *
+from cutlass.epilogue import *
+from cutlass import swizzle
+
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+
+cutlass.set_log_level(logging.WARNING)
+
+
+@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
+class TestEVTComputeSM90(EVTTestCaseBase):
+
+    def test_arith(self):
+        """
+        Test arithmetic ops
+        """
+        def evt_arith_compute(accum, C, alpha, beta, gamma):
+            D = ((accum + C) * alpha - gamma) / beta
+            return D
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.5,
+                "beta": 0.5,
+                "gamma": 2.5,
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+
+            launcher = EVTTestBed(self.element, evt_arith_compute, example_inputs)
+            input_keys = ["C", "alpha", "beta", "gamma"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_func_call(self):
+        """
+        Test Function call
+        """
+        def evt_func_call(accum, C, alpha, beta, gamma):
+            D = multiply_add(relu(accum + alpha) + C, beta, gamma)
+            return D
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.5,
+                "beta": 0.5,
+                "gamma": 2.5,
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+
+            launcher = EVTTestBed(self.element, evt_func_call, example_inputs)
+            input_keys = ["C", "alpha", "beta", "gamma"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/cutlass/evt/evt_layout_sm80_90.py
+++ b/test/python/cutlass/evt/evt_layout_sm80_90.py
@ -0,0 +1,173 @@
+################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+"""
+Unit test for layout nodes (permute and reshape) in SM80 and SM90
+"""
+
+import logging
+import unittest
+
+import cutlass
+from cutlass.backend import *
+from cutlass.epilogue import *
+
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+
+cutlass.set_log_level(logging.WARNING)
+
+
+@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
+class TestEVTLayoutSM90(EVTTestCaseBase):
+
+    def test_permute_1(self):
+        """
+        Permute F and C with indices (0, 2, 1), add them, and permute back; D has shape [l, m, n]
+        """
+        def evt_permute(accum, alpha, C):
+            F = alpha * accum
+            F_permute = permute(F, indices=(0, 2, 1))
+            D_permute = F_permute + permute(C, indices=(0, 2, 1))
+            D = permute(D_permute, indices=(0, 2, 1))
+            return D, F
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
+    def test_permute_2(self):
+        """
+        Permute F with indices (0, 2, 1); returning a tensor D with shape [l, n, m]
+        """
+        def evt_permute(accum, alpha, C):
+            F = alpha * accum
+            F_permute = permute(F, indices=(0, 2, 1))
+            D = F_permute + C
+            return D, F
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (l, n, m)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, n, m)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
+    def test_permute_3(self):
+        """
+        Permute F with indices (1, 0, 2); returning a tensor D with shape [m, l, n]
+        """
+        def evt_permute(accum, alpha, C):
+            F = alpha * accum
+            F_permute = permute(F, indices=(1, 0, 2))
+            D = F_permute + C
+            return D, F
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (m, l, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (m, l, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_reshape(self):
+        """
+        Test reshape
+        """
+        def evt_reshape(accum, alpha, TensorE):
+            F = alpha * accum
+            E_reshape = reshape(TensorE, new_shape=(512, 1))
+            D = F + E_reshape
+            return D
+
+        example_inputs = {
+            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
+            "alpha": 0.5,
+            "TensorE": self.fake_tensor(self.element, (16, 32)),
+            "D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
+        }
+
+        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
+        input_keys = ["alpha", "TensorE"]
+        result_keys = ["D"]
+        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
+
+    def test_reshape2(self):
+        """
+        Test reshape
+        """
+        def evt_reshape(accum, alpha, TensorE):
+            F = alpha * accum
+            F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
+            D = F_reshape + TensorE
+            return D
+
+        example_inputs = {
+            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
+            "alpha": 0.5,
+            "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
+            "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
+        }
+
+        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
+        input_keys = ["alpha", "TensorE"]
+        result_keys = ["D"]
+        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/cutlass/evt/evt_load_sm80_90.py
+++ b/test/python/cutlass/evt/evt_load_sm80_90.py
@ -0,0 +1,142 @@
+################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+"""
+Unit test for load nodes in SM80 and SM90
+"""
+
+import logging
+import unittest
+
+import cutlass
+from cutlass.backend import *
+from cutlass.epilogue import *
+
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+
+cutlass.set_log_level(logging.WARNING)
+
+
+@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
+class TestEVTLoadSM90(EVTTestCaseBase):
+
+    def test_tensor_load(self):
+        """
+        Load extra tensor with shape [m, n]
+        """
+        def evt_tensor_load(accum, C, aux, aux_batch):
+            D = accum + C + aux + aux_batch
+            return D
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "aux": self.fake_tensor(self.element, (m, n)),
+                "aux_batch": self.fake_tensor(np.float32, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
+            input_keys = ["C", "aux", "aux_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_row_broadcast(self):
+        """
+        Load extra tensor with shape [1, n]
+        """
+        def evt_row_broadcast(accum, C, bias, bias_batch):
+            D = accum + C + bias + bias_batch
+            return D
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "bias": self.fake_tensor(self.element, (n,)),
+                "bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
+            input_keys = ["C", "bias", "bias_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_column_broadcast(self):
+        """
+        Load extra tensor with shape [m, 1]
+        """
+        def evt_column_broadcast(accum, C, bias, bias_batch):
+            D = accum + C + bias + bias_batch
+            return D
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "bias": self.fake_tensor(self.element, (m, 1)),
+                "bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
+            input_keys = ["C", "bias", "bias_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_scalar_broadcast(self):
+        """
+        Load extra tensor with shape [1, 1]
+        """
+        def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
+            D = accum + C + alpha + alpha_batch
+            return D
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
+            input_keys = ["C", "alpha", "alpha_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/cutlass/evt/evt_mixed_sm80_90.py
+++ b/test/python/cutlass/evt/evt_mixed_sm80_90.py
@ -0,0 +1,274 @@
+################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+"""
+Unit test for mixed types of nodes in SM80 and SM90
+"""
+
+import logging
+import unittest
+
+import cutlass
+from cutlass.backend import *
+from cutlass.epilogue import *
+from cutlass.swizzle import ThreadblockSwizzleStreamK
+
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+
+cutlass.set_log_level(logging.WARNING)
+
+
+@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
+class TestEVTMixedSM90(EVTTestCaseBase):
+    def test_mixed_dag(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+
+        if device_cc() == 80:
+            aligments = [2, 4, 8]
+        else:
+            # Sm90 EVT currently only supports 128-bit alignment
+            aligments = [8,]
+        for align in aligments:
+            for m, n, k, l in self.get_problem_sizes(align):
+                example_inputs = {
+                    "accum": self.fake_tensor(self.element, (l, m, n)),
+                    "alpha": 1.0,
+                    "C": self.fake_tensor(self.element, (l, m, n)),
+                    "beta": 1.0,
+                    "aux": self.fake_tensor(self.element, (l, m, n)),
+                    "cbias": self.fake_tensor(self.element, (m, 1)),
+                    "rbias": self.fake_tensor(self.element, (n,)),
+                    "D": self.fake_tensor(self.element, (l, m, n)),
+                    "F": self.fake_tensor(self.element, (l, m, n)),
+                    "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                    "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+                }
+
+                launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
+                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+                result_keys = ["D", "F", "F_row_max", "E_col_max"]
+                launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
+    def test_mixed_dag_float(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+
+        for align in [3, 2, 4]:
+            for m, n, k, l in self.get_problem_sizes(align):
+                example_inputs = {
+                    "accum": self.fake_tensor(np.float32, (l, m, n)),
+                    "alpha": 1.0,
+                    "C": self.fake_tensor(np.float32, (l, m, n)),
+                    "beta": 1.0,
+                    "aux": self.fake_tensor(np.float32, (l, m, n)),
+                    "cbias": self.fake_tensor(np.float32, (m, 1)),
+                    "rbias": self.fake_tensor(np.float32, (n,)),
+                    "D": self.fake_tensor(np.float32, (l, m, n)),
+                    "F": self.fake_tensor(np.float32, (l, m, n)),
+                    "F_row_max": self.fake_tensor(np.float32, (n,)),
+                    "E_col_max": self.fake_tensor(np.float32, (m, 1))
+                }
+                launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
+                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+                result_keys = ["D", "F", "F_row_max", "E_col_max"]
+                launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
+    def test_mixed_dag_stage2(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "beta": 1.0,
+                "aux": self.fake_tensor(self.element, (l, m, n)),
+                "cbias": self.fake_tensor(self.element, (m, 1)),
+                "rbias": self.fake_tensor(self.element, (n,)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+            }
+
+            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
+            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+            result_keys = ["D", "F", "F_row_max", "E_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
+    def test_mixed_dag_partition_k(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "beta": 1.0,
+                "aux": self.fake_tensor(self.element, (l, m, n)),
+                "cbias": self.fake_tensor(self.element, (m, 1)),
+                "rbias": self.fake_tensor(self.element, (n,)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+            }
+
+            tile_description = {
+                "threadblock_shape": [128, 128, 64],
+                "warp_count": [2, 2, 2]
+            }
+
+            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
+            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+            result_keys = ["D", "F", "F_row_max", "E_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
+    def test_mixed_dag_stream_k(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+
+        # High per-sm occupancy tile_description
+        tile_description = {
+            "threadblock_shape": [128, 128, 32],
+            "warp_count": [2, 2, 1],
+            "stages": 3
+        }
+        tds = [None, tile_description]
+        for td in tds:
+            for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
+                if l == 1:
+                    example_inputs = {
+                        "accum": self.fake_tensor(self.element, (m, n)),
+                        "alpha": 1.0,
+                        "C": self.fake_tensor(self.element, (m, n)),
+                        "beta": 1.0,
+                        "aux": self.fake_tensor(self.element, (m, n)),
+                        "cbias": self.fake_tensor(self.element, (m, 1)),
+                        "rbias": self.fake_tensor(self.element, (n,)),
+                        "D": self.fake_tensor(self.element, (m, n)),
+                        "F": self.fake_tensor(self.element, (m, n)),
+                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+                    }
+                else:
+                    example_inputs = {
+                        "accum": self.fake_tensor(self.element, (l, m, n)),
+                        "alpha": 1.0,
+                        "C": self.fake_tensor(self.element, (l, m, n)),
+                        "beta": 1.0,
+                        "aux": self.fake_tensor(self.element, (l, m, n)),
+                        "cbias": self.fake_tensor(self.element, (m, 1)),
+                        "rbias": self.fake_tensor(self.element, (n,)),
+                        "D": self.fake_tensor(self.element, (l, m, n)),
+                        "F": self.fake_tensor(self.element, (l, m, n)),
+                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+                    }
+
+                if td is not None:
+                    launcher = EVTTestBed(
+                        self.element, evt_mixed_dag, example_inputs,
+                        tile_description=td,
+                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
+                else:
+                    launcher = EVTTestBed(
+                        self.element, evt_mixed_dag, example_inputs,
+                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
+
+                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+                result_keys = ["D", "F", "F_row_max", "E_col_max"]
+                launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_mixed_dag_no_batch(self):
+        def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+
+        for m, n, k, _ in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (m, n)),
+                "alpha": 1.0,
+                "C": self.fake_tensor(self.element, (m, n)),
+                "beta": 1.0,
+                "aux": self.fake_tensor(self.element, (m, n)),
+                "cbias": self.fake_tensor(self.element, (m, 1)),
+                "rbias": self.fake_tensor(self.element, (n,)),
+                "D": self.fake_tensor(self.element, (m, n)),
+                "F": self.fake_tensor(self.element, (m, n)),
+                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+            }
+
+            launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
+            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+            result_keys = ["D", "F", "F_row_max", "E_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, 1)
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/cutlass/evt/evt_store_sm80_90.py
+++ b/test/python/cutlass/evt/evt_store_sm80_90.py
@ -0,0 +1,155 @@
+################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+"""
+Unit tests for store nodes on SM80 and SM90
+"""
+
+import logging
+import unittest
+
+import cutlass
+from cutlass.backend import *
+from cutlass.epilogue import *
+
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+
+cutlass.set_log_level(logging.WARNING)
+
+
+@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
+class TestEVTStoreSM90(EVTTestCaseBase):
+    """
+    Tests of epilogue functions that return additional outputs next to D: an auxiliary
+    tensor, or reductions of the accumulator and of intermediate results.
+
+    Every test traces a small epilogue function through EVTTestBed and calls
+    `verify` for each problem size. The class is skipped unless device_cc()
+    returns 80 or 90.
+    """
+
+    def test_aux_store(self):
+        """
+        Returning a tensor with shape [m, n]
+        """
+        # The intermediate F is returned in addition to D
+        def evt_aux_store(accum, alpha, C):
+            F = alpha * accum
+            D = F + C
+            return D, F
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            # Keys match the argument and result names of the epilogue function
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
+            input_keys = ["C", "alpha"]
+            # Same order as the values returned by the epilogue function
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_col_reduce(self):
+        """
+        Reduction [m, n] -> [m, 1]
+        """
+        # NOTE: the traced function is named `evt_row_reduce` although this test is
+        # `test_col_reduce`; its reduced outputs have shapes (l, m, 1) and (m, 1).
+        def evt_row_reduce(accum, alpha, C):
+            acc_row_max = max(accum, dim=[2,])
+            F = alpha * accum
+            F_row_max = max(F, dim=[0, 2])
+            D = F + C
+            return D, F_row_max, acc_row_max
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            # The reduction outputs are declared as float32, the other tensors use self.element
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 2.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F_row_max": self.fake_tensor(np.float32, (m, 1)),
+                "acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
+            input_keys = ["C", "alpha"]
+            # Same order as the values returned by the epilogue function
+            result_keys = ["D", "F_row_max", "acc_row_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_row_reduce(self):
+        """
+        Reduction [m, n] -> [n]
+        """
+        # NOTE: the traced function is named `evt_col_reduce` although this test is
+        # `test_row_reduce`; its reduced outputs have shapes (l, 1, n) and (n,).
+        def evt_col_reduce(accum, alpha, C):
+            acc_col_max = max(accum, dim=[1,])
+            F = alpha * accum
+            F_col_max = max(F, dim=[0, 1])
+            D = F + C
+            return D, F_col_max, acc_col_max
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            # The reduction outputs are declared as float32, the other tensors use self.element
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 2.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F_col_max": self.fake_tensor(np.float32, (n,)),
+                "acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
+            input_keys = ["C", "alpha"]
+            # Same order as the values returned by the epilogue function
+            result_keys = ["D", "F_col_max", "acc_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+    def test_scalar_reduce(self):
+        """
+        Reduction [m, n] -> [1,]
+        """
+        # acc_max keeps the leading dimension (shape (l, 1, 1)); F_max has shape (1,)
+        def evt_scalar_reduce(accum, alpha, C):
+            acc_max = max(accum, dim=[1, 2])
+            F = alpha * accum
+            F_max = max(F, dim=[0, 1, 2])
+            D = F + C
+            return D, F_max, acc_max
+
+        for m, n, k, l in self.get_problem_sizes(8):
+            # The reduction outputs are declared as float32, the other tensors use self.element
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 2.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
+                "F_max": self.fake_tensor(np.float32, (1,)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+
+            launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
+            input_keys = ["C", "alpha"]
+            # Same order as the values returned by the epilogue function
+            result_keys = ["D", "F_max", "acc_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+
+
+# Run the tests of this module when the file is executed directly as a script
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/cutlass/evt/run_all_tests.py
+++ b/test/python/cutlass/evt/run_all_tests.py
@ -30,12 +30,14 @@
 #
 #################################################################################################

+import pathlib
 import unittest


 if __name__ == '__main__':
    loader = unittest.TestLoader()
-    tests = loader.discover('./', 'gemm_*.py')
+    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
+    tests = loader.discover(script_dir, 'evt_*.py')
    testRunner = unittest.runner.TextTestRunner()
    results = testRunner.run(tests)
    if not results.wasSuccessful():
--- a/test/python/cutlass/evt/utils/evt_testbed.py
+++ b/test/python/cutlass/evt/utils/evt_testbed.py
@ -0,0 +1,230 @@
+################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+
+"""
+Testbed classes of EVT
+"""
+
+import torch
+import unittest
+
+import cutlass
+from cutlass import Tensor
+import cutlass.backend.evt
+from cutlass.profiler import CUDAEventProfiler
+from cutlass.shape import GemmCoord
+from cutlass.utils.datatypes import torch_type
+
+
+class EVTReferenceModule:
+    """
+    Host reference of a GEMM followed by an epilogue visitor
+
+    The mainloop is evaluated with ``torch.matmul`` and the epilogue by calling
+    ``epilogue_visitor`` with the accumulator passed as the ``accum`` keyword argument.
+    """
+    def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
+        self.layout_A = layout_A
+        self.layout_B = layout_B
+        self.layout_C = layout_C
+        self.epilogue_visitor = epilogue_visitor
+
+    @staticmethod
+    def _as_row_major(tensor, layout, batch, rows, cols):
+        """
+        View ``tensor`` as a (batch, rows, cols) operand; non-row-major data is viewed as
+        (batch, cols, rows) and its last two dimensions are permuted
+        """
+        if layout == cutlass.LayoutType.RowMajor:
+            return tensor.view((batch, rows, cols))
+        return torch.permute(tensor.view((batch, cols, rows)), (0, 2, 1))
+
+    def run(self, A, B, C, problem_size, alpha, beta, batch=1):
+        """
+        Compute ``alpha * (A @ B) + beta * C`` and return the result flattened
+
+        :param A: operand A, viewed as (batch, m, k) after layout handling
+        :param B: operand B, viewed as (batch, k, n) after layout handling
+        :param C: operand C, viewed as (batch, m, n) after layout handling
+        :param problem_size: object exposing the GEMM extents as ``m``, ``n`` and ``k``
+        :param alpha: scale applied to the matrix product
+        :param beta: scale applied to C
+        :param batch: batch count
+
+        :return: flattened result, permuted back when layout_C is column major
+        """
+        m, n, k = problem_size.m, problem_size.n, problem_size.k
+        A_row = self._as_row_major(A, self.layout_A, batch, m, k)
+        B_row = self._as_row_major(B, self.layout_B, batch, k, n)
+        C_row = self._as_row_major(C, self.layout_C, batch, m, n)
+
+        out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
+
+        if self.layout_C == cutlass.LayoutType.ColumnMajor:
+            out = torch.permute(out_row, (0, 2, 1))
+        else:
+            out = out_row
+
+        return torch.flatten(out)
+
+    def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
+        """
+        Run the reference mainloop and epilogue
+
+        :param epilogue_args: keyword arguments of the epilogue visitor (without ``accum``);
+            ``None`` is treated as no arguments. The mapping is copied, so the caller's
+            dictionary is left unmodified.
+
+        :return: tuple with the results of the epilogue visitor
+        """
+        # The default of None used to fail on the item assignment below; copying also keeps
+        # the "accum" entry out of the dictionary owned by the caller.
+        epilogue_args = {} if epilogue_args is None else dict(epilogue_args)
+
+        # Running the mainloop (alpha = 1, beta = 0 yields the plain accumulator)
+        accum = self.run(
+            A, B, C, problem_size, 1.0, 0.0, batch=batch
+        ).reshape(batch, problem_size.m, problem_size.n)
+
+        # Running the epilogue
+        epilogue_args["accum"] = accum
+        references = self.epilogue_visitor(**epilogue_args)
+
+        # Return the results, always as a tuple
+        if not isinstance(references, tuple):
+            references = (references,)
+        return references
+
+        
+
+class EVTTestBed:
+    """
+    Testbed that runs a GEMM with a traced epilogue visitor on the device and compares
+    every result against the host reference
+    """
+    def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
+        self.element = element
+        row_major = cutlass.LayoutType.RowMajor
+        self.example_inputs = example_inputs
+
+        # GEMM plan: all operands row major, float32 accumulation
+        self.plan = cutlass.op.Gemm(element=element, layout=row_major, element_accumulator=torch.float32)
+
+        # Optional overrides of the plan
+        for option in ("tile_description", "swizzling_functor"):
+            if option in kwargs:
+                setattr(self.plan, option, kwargs[option])
+
+        # Trace the epilogue function and attach it to the plan
+        visitor = cutlass.epilogue.trace(evt_fn, example_inputs)
+        if "epilogue_stages" in kwargs:
+            visitor.epilogue_stages = kwargs["epilogue_stages"]
+        self.plan.epilogue_visitor = visitor
+
+        # Host reference built on the same visitor
+        self.reference_fn = EVTReferenceModule(row_major, row_major, row_major, visitor)
+
+        self.profile = profile
+
+    def get_torch_tensor(self, shape, dtype=None, fill=None):
+        """
+        Create a CUDA tensor of the given shape; `dtype` defaults to the testbed element.
+        The tensor is filled with `fill` when given, otherwise with the ceiling of values
+        drawn by uniform_(-4.5, 3.5)
+        """
+        torch_dtype = torch_type(self.element if dtype is None else dtype)
+        if fill is not None:
+            return torch.full(shape, fill, dtype=torch_dtype, device="cuda")
+
+        random_values = torch.empty(size=shape, dtype=torch_dtype, device="cuda").uniform_(-4.5, 3.5)
+        return torch.ceil(random_values)
+
+    def verify(self, problem_size, input_keys, result_keys, batch_count=1):
+        """
+        Run the device kernel and the host reference, and assert that every result matches
+        """
+        problem_size = GemmCoord(*problem_size)
+
+        # Operands of the GEMM
+        tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
+        tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))
+
+        # Arguments of the epilogue: inputs get random values, results get an initial fill
+        epilogue_args = {}
+        for key, example in self.example_inputs.items():
+            is_input = key in input_keys
+            if not is_input and key not in result_keys:
+                # Neither an input nor a result (e.g. the accumulator): nothing to allocate
+                continue
+
+            if not isinstance(example, Tensor):
+                # Scalars are passed through unchanged
+                epilogue_args[key] = example
+            elif is_input:
+                epilogue_args[key] = self.get_torch_tensor(example.shape, example.element)
+            else:
+                initial_value = -1000 if "max" in key else 0
+                epilogue_args[key] = self.get_torch_tensor(
+                    example.shape, example.element, fill=initial_value)
+
+        tensor_D = epilogue_args["D"]
+        # D stands in for C when the epilogue has no C argument
+        tensor_C = epilogue_args.get("C", tensor_D)
+
+        # Device kernel
+        self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)
+
+        # Host reference, fed with the inputs only
+        evt_args_inputs = {key: epilogue_args[key] for key in input_keys}
+        reference_results = self.reference_fn(
+            tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)
+
+        # Device results must be bit-identical to the reference
+        for result_key, reference in zip(result_keys, reference_results):
+            assert torch.equal(epilogue_args[result_key].flatten(), reference.flatten())
+
+        # Optional profiling
+        if self.profile:
+            profiler = CUDAEventProfiler(
+                self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
+                visitor_args=epilogue_args
+            )
+            print(f"Cutlass Python Duration: {profiler()}")
+
+
+class EVTTestCaseBase(unittest.TestCase):
+    """
+    Base class for EVT Unittest
+
+    Stores the default element type and problem extents, seeds the torch random number
+    generator, and provides helpers to create example tensors and problem sizes.
+    """
+    def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
+        super().__init__(methodName)
+
+        self.element = cutlass.DataType.f16
+        self.l, self.m, self.n, self.k = lmnk
+
+        self.problem_size = (self.m, self.n, self.k)
+
+        # Fixed seed so that the random test data is reproducible
+        torch.random.manual_seed(42)
+
+    def fake_tensor(self, element, shape):
+        """
+        Create a row-major `Tensor` with the given element type and shape
+        """
+        return Tensor(element=element, shape=shape, layout_tag=cutlass.LayoutType.RowMajor)
+
+    def get_problem_sizes(self, alignment, k=None, batch_count=(3,)):
+        """
+        Enumerate the problem sizes to test
+
+        :param alignment: used as the smallest extent of m and n and to derive the larger ones
+        :param k: extent of k; a falsy value (None or 0) selects ``self.k``
+        :param batch_count: iterable of batch counts to combine with every (m, n) pair
+
+        :return: list of ``(m, n, k, l)`` tuples, with m varying slowest and l fastest
+        """
+        k = k if k else self.k
+        problem_size_m = [alignment, 512 - 3 * alignment]
+        problem_size_n = [alignment, 512 - alignment]
+        if alignment % 8 == 0:
+            problem_size_m.append(768)
+            problem_size_n.append(768)
+
+        return [
+            (m, n, k, l)
+            for m in problem_size_m
+            for n in problem_size_n
+            for l in batch_count
+        ]
--- a/test/python/cutlass/gemm/gemm_batched.py
+++ b/test/python/cutlass/gemm/gemm_batched.py
@ -35,15 +35,15 @@ High-level tests for running batched GEMMs
 """

 from functools import partial
-from math import prod
-
-import cutlass
 import logging
-import torch
+from math import prod
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc
+import torch
+
+from utils import LayoutCombination, add_test_gemm

 cutlass.set_log_level(logging.WARNING)

@ -130,10 +130,5 @@ class GemmF16Batched(unittest.TestCase):
        self.run_batched((3,), False, True, False)
        self.run_batched((2, 3), False, True, False)

-    def test_batched_C(self):
-        self.run_batched((3,), False, False, True)
-        self.run_batched((2, 3), False, False, True)
-
-
 if __name__ == '__main__':
    unittest.main()
--- a/test/python/cutlass/gemm/gemm_f16_sm80.py
+++ b/test/python/cutlass/gemm/gemm_f16_sm80.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F16 operands on SM80
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 80
--- a/test/python/cutlass/gemm/gemm_f16_sm90.py
+++ b/test/python/cutlass/gemm/gemm_f16_sm90.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F16 operands on SM90
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 90
--- a/test/python/cutlass/gemm/gemm_f32_sm80.py
+++ b/test/python/cutlass/gemm/gemm_f32_sm80.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F32 operands on SM80
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 80
--- a/test/python/cutlass/gemm/gemm_f64_sm80.py
+++ b/test/python/cutlass/gemm/gemm_f64_sm80.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F64 operands on SM80
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 80
--- a/test/python/cutlass/gemm/gemm_f64_sm90.py
+++ b/test/python/cutlass/gemm/gemm_f64_sm90.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F64 operands on SM90
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 90
--- a/test/python/cutlass/gemm/gemm_s8_sm80.py
+++ b/test/python/cutlass/gemm/gemm_s8_sm80.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with S8 operands on SM80
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 80
--- a/test/python/cutlass/gemm/gemm_s8_sm90.py
+++ b/test/python/cutlass/gemm/gemm_s8_sm90.py
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with S8 operands on SM90
 """

 from functools import partial
-
-import cutlass
 import logging
 import unittest

-from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
+import cutlass
 from cutlass.backend.utils.device import device_cc

+from utils import LayoutCombination, add_test_gemm
+

 cutlass.set_log_level(logging.WARNING)
 cc = 90
--- a/test/python/cutlass/gemm/gemm_testbed.py
+++ b/test/python/cutlass/gemm/gemm_testbed.py
@ -0,0 +1,387 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+from math import prod
+import os
+import re
+import subprocess
+
+import torch
+
+from cutlass import (
+    DataType,
+    DataTypeSize,
+    GemmUniversalMode,
+    LayoutType,
+    OpcodeClass,
+    ShortDataTypeNames,
+    SwizzlingFunctor
+)
+
+from cutlass.backend import compiler
+from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
+from cutlass.backend.memory_manager import get_allocated_size
+from cutlass.backend.reduction_operation import ReductionArguments, ReductionOperation
+from cutlass.shape import GemmCoord, MatrixCoord
+from cutlass.utils.datatypes import torch_type
+
+
+class GemmUniversalLauncher:
+    """
+    Compiles a GEMM operation, runs it on randomly initialized operands and compares the
+    result against a PyTorch reference
+    """
+    def __init__(
+        self,
+        operation,
+        seed=2080,
+        verification=True,
+        iterations=500,
+        compiler_mode="nvcc",
+        **kwargs,
+    ) -> None:
+        """
+        :param operation: GEMM operation to compile and test
+        :param seed: seed of the torch random number generator, applied at the start of `run`
+        :param verification: whether `run` compares the result against the reference
+        :param iterations: accepted for interface compatibility; not used by this class
+        :param compiler_mode: "nvcc" or "nvrtc"
+
+        :raises Exception: if `compiler_mode` is neither "nvcc" nor "nvrtc"
+        """
+        # Create the reduction kernel, if needed
+        self.reduction_operation: ReductionOperation = ReductionOperation(
+            shape=MatrixCoord(4, 32 * operation.C.alignment),
+            C=operation.C,
+            element_accumulator=operation.tile_description.math_instruction.element_accumulator,
+            element_compute=operation.epilogue_functor.element_epilogue,
+            epilogue_functor=operation.epilogue_functor,
+            count=operation.C.alignment,
+        )
+
+        self.math_operation = operation.tile_description.math_instruction.math_operation
+        self.verification = verification
+
+        if compiler_mode == "nvcc":
+            compiler.nvcc()
+        elif compiler_mode == "nvrtc":
+            compiler.nvrtc()
+        else:
+            raise Exception(f"Unexpected compiler string {compiler_mode}")
+
+        op_list = [operation]
+        if operation.arch < 90:
+            # Split K via Python is currently only supported for pre-SM90 kernels
+            op_list.append(self.reduction_operation)
+
+        compiler.add_module(op_list, bypass_cache=False)
+
+        self.operation = operation
+
+        self.dtype_A = torch_type(operation.A.element)
+        self.dtype_B = torch_type(operation.B.element)
+        self.dtype_C = torch_type(operation.C.element)
+        # D uses the element type of C
+        self.dtype_D = torch_type(operation.C.element)
+
+        element_size = DataTypeSize[operation.A.element]
+
+        # Range of the random operand values, selected by the size of the A element
+        if element_size == 1:
+            self.rand_max = 1
+            self.rand_min = 0
+        elif element_size <= 8:
+            self.rand_max = 1
+            self.rand_min = -1
+        elif element_size == 16:
+            self.rand_max = 4
+            self.rand_min = -4
+        else:
+            self.rand_max = 8
+            self.rand_min = -8
+
+        self.seed = seed
+
+        self.compute_type = operation.epilogue_functor.element_epilogue
+        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
+
+    def print_problem_size(self, p, mode, batch_count):
+        """
+        Print the problem extents, the batch count and the GEMM mode
+        """
+        if mode == GemmUniversalMode.Gemm:
+            mode = "Gemm"
+        elif mode == GemmUniversalMode.Batched:
+            mode = "GemmBatched"
+        elif mode == GemmUniversalMode.GemmSplitKParallel:
+            mode = "GemmSplitKParallel"
+        print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")
+
+    def uniform_init(self, shape, dtype, layout):
+        """
+        Create a randomly initialized tensor
+
+        :return: tuple ``(data_cutlass, data_ref)``: the CUDA tensor passed to the kernel
+            (last two dimensions transposed and made contiguous unless `layout` is row
+            major) and the reference tensor of the requested shape
+        """
+        size = prod(shape)
+        if dtype.is_floating_point:
+            data = torch.ceil(torch.empty(size=(size,), dtype=dtype, device="cuda").uniform_(self.rand_min - 0.5, self.rand_max - 0.5))
+        else:
+            # PyTorch does not currently support integer-typed matrix multiplications on GPU.
+            # Fall back to CPU for integer type references.
+            data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)
+
+        # float32 and float64 reference data is kept on the CPU as well
+        if dtype == torch.float64 or dtype == torch.float32:
+            data = data.to("cpu")
+
+        data_ref = data.reshape(shape)
+
+        if layout == LayoutType.RowMajor:
+            data_cutlass = data_ref
+        else:
+            data_cutlass = data_ref.transpose(-1, -2).contiguous()
+
+        data_cutlass = data_cutlass.to("cuda")
+        return data_cutlass, data_ref
+
+    def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
+        """
+        Compute ``alpha * (A @ B) + beta * C`` with PyTorch and cast the result to the
+        data type of D. `problem_size` is not used.
+        """
+        # If any tensor is on CPU, place all tensors on CPU unless only
+        # tensor C is on CPU
+        devices = [x.device.type for x in [tensor_A, tensor_B, tensor_C]]
+        if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
+            device = torch.device("cpu")
+        else:
+            device = tensor_A.device
+
+        tensor_A = tensor_A.to(device)
+        tensor_B = tensor_B.to(device)
+        tensor_C = tensor_C.to(device)
+
+        dtype = torch_type(self.compute_type)
+        alpha_torch = torch.tensor([alpha], device=device).to(dtype)
+        beta_torch = torch.tensor([beta], device=device).to(dtype)
+
+        tmp = tensor_A @ tensor_B
+        tensor_D_ref = (alpha_torch * tmp) + (tensor_C * beta_torch)
+        return tensor_D_ref.to(self.dtype_D)
+
+    def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
+        """
+        Run the operation for one problem and, if verification is enabled, compare the
+        result with the reference
+
+        :return: True if verification is disabled or the result equals the reference
+
+        :raises AssertionError: if memory is still allocated after the run
+        """
+        torch.random.manual_seed(self.seed)
+
+        # Assign an actual batch count in cases where we are not running in batched mode.
+        # This is to differentiate between the number of split K slices and the batch count,
+        # which are overloaded within the single `batch_count` variable.
+        if mode == GemmUniversalMode.Batched:
+            true_batch_count = batch_count
+        else:
+            true_batch_count = 1
+
+        def transpose(layout):
+            if layout == LayoutType.RowMajor:
+                return LayoutType.ColumnMajor
+            else:
+                return LayoutType.RowMajor
+
+        tensor_A, tensor_A_ref = self.uniform_init(
+            (true_batch_count, problem_size.m, problem_size.k),
+            self.dtype_A,
+            self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
+        )
+        tensor_B, tensor_B_ref = self.uniform_init(
+            (true_batch_count, problem_size.k, problem_size.n),
+            self.dtype_B,
+            self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
+        )
+        tensor_C, tensor_C_ref = self.uniform_init(
+            (true_batch_count, problem_size.m, problem_size.n),
+            self.dtype_C,
+            self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
+        )
+        tensor_D = torch.zeros_like(tensor_C)
+
+        # Integer compute types take integer scaling factors
+        if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
+            alpha = int(alpha)
+            beta = int(beta)
+
+        #
+        # Launch kernel
+        #
+
+        arguments = GemmArguments(
+            operation=self.operation,
+            problem_size=problem_size,
+            A=tensor_A,
+            B=tensor_B,
+            C=tensor_C,
+            D=tensor_D,
+            output_op=self.operation.epilogue_type(alpha, beta),
+            gemm_mode=mode,
+            split_k_slices=split_k_slices,
+            batch=batch_count,
+        )
+
+        if mode == GemmUniversalMode.GemmSplitKParallel:
+            # The reduction reads the GEMM output as its workspace and writes tensor_D
+            reduction_arguments = ReductionArguments(
+                self.reduction_operation,
+                problem_size=[problem_size.m, problem_size.n],
+                partitions=split_k_slices,
+                workspace=arguments.ptr_D,
+                destination=tensor_D,
+                source=tensor_C,
+                output_op=self.reduction_operation.epilogue_type(alpha, beta),
+            )
+
+        self.operation.run(arguments)
+
+        if mode == GemmUniversalMode.GemmSplitKParallel:
+            self.reduction_operation.run(reduction_arguments)
+
+        passed = True
+
+        if self.verification:
+            if mode == GemmUniversalMode.GemmSplitKParallel:
+                reduction_arguments.sync()
+            else:
+                arguments.sync()
+            tensor_D_ref = self.reference(
+                problem_size,
+                tensor_A_ref,
+                tensor_B_ref,
+                tensor_C_ref,
+                alpha,
+                beta,
+            )
+
+            tensor_D_ref = tensor_D_ref.to('cuda')
+
+            if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
+                tensor_D = tensor_D.transpose(-1, -2).contiguous()
+
+            passed = tensor_D.equal(tensor_D_ref)
+
+            # Report the failing problem. An explicit check is used instead of catching an
+            # `assert`, which is removed when Python runs with -O.
+            if not passed:
+                self.print_problem_size(problem_size, mode, batch_count)
+        del arguments
+        if mode == GemmUniversalMode.GemmSplitKParallel:
+            del reduction_arguments
+
+        # Checked after the argument objects have been deleted
+        cur_size = get_allocated_size()
+        assert cur_size == 0, f"{cur_size} B of memory were not released after this run"
+
+        return passed
+
+
+def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
+    """
+    Runs ``operation`` through a sweep of GEMM modes, problem sizes, and batch counts, stopping at
+    the first failing problem.
+
+    :param operation: GEMM operation to be tested
+    :type operation: GemmOperationUniversal
+    :param testcase: sweep to run. "multistage" runs a small Gemm-only sweep; any other value runs the
+        general sweep (very small K problems are skipped only for "universal"). "interleaved" is rejected
+    :type testcase: str
+    :param compilation_mode: compiler handed to ``GemmUniversalLauncher`` as ``compiler_mode``
+    :type compilation_mode: str
+
+    :return: True if every problem that was run passed, False as soon as one fails
+    :rtype: bool
+    """
+    # Local import: only this function needs it, and the file's import block is left untouched
+    from itertools import product
+
+    minimum_operand_element_size = min(
+        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
+    )
+    opcode_class = operation.tile_description.math_instruction.opcode_class
+    is_simt = opcode_class == OpcodeClass.Simt
+
+    # SIMT kernels start from an alignment of 1; all others derive it from the narrowest operand
+    # (presumably DataTypeSize is in bits, which makes this a 128-bit access -- TODO confirm)
+    alignment = 1 if is_simt else 128 // minimum_operand_element_size
+
+    alignment_m = alignment
+    alignment_n = alignment
+    alignment_k = alignment
+
+    # INT8 alignment constraints
+    if is_simt:
+        A_is_s8 = operation.A.element == DataType.s8
+        B_is_s8 = operation.B.element == DataType.s8
+
+        if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
+            alignment_m = 4
+        # The N alignment is governed by operand B (element type and layout), mirroring the
+        # A/M check above. B_is_s8 is already a bool and must not be compared to DataType.s8.
+        if B_is_s8 and operation.B.layout == LayoutType.RowMajor:
+            alignment_n = 4
+        if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
+            alignment_k = 4
+
+    threadblock_k = operation.tile_description.threadblock_shape[2]
+
+    assert testcase != "interleaved"
+
+    supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK
+
+    if testcase == "multistage":
+        modes = [GemmUniversalMode.Gemm]
+        problem_size_m = [16, 528]
+        problem_size_n = [16, 528]
+        problem_size_k = [
+            threadblock_k,
+            threadblock_k * operation.tile_description.stages
+            + operation.tile_description.math_instruction.instruction_shape[2],
+        ]
+        problem_alpha = [1.0]
+        problem_beta = [0.0]
+        batch_counts = [1]
+    else:
+        modes = [GemmUniversalMode.Gemm]
+        batch_counts = [1, 2, 3, 5, 7]
+        if supports_split_k:
+            modes.append(GemmUniversalMode.GemmSplitKParallel)
+
+        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
+        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
+        # Fall back to 7 stages for sizing K when the tile description does not set a stage count
+        if operation.tile_description.stages is None:
+            stages_for_k_calc = 7
+        else:
+            stages_for_k_calc = operation.tile_description.stages
+        problem_size_k = [
+            alignment_k,
+            threadblock_k * stages_for_k_calc - alignment_k,
+            threadblock_k * stages_for_k_calc * 3 - alignment_k,
+        ]
+        problem_alpha = [1.0]
+        problem_beta = [2.0]
+
+    testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)
+
+    # product() iterates with the rightmost argument varying fastest, i.e. in the same order as
+    # nested loops over mode, m, n, k, batch_count, alpha, beta
+    for mode, m, n, k, batch_count, alpha, beta in product(
+        modes, problem_size_m, problem_size_n, problem_size_k, batch_counts, problem_alpha, problem_beta
+    ):
+        # skip very small K problems
+        if testcase == "universal" and k // batch_count < 2 * threadblock_k:
+            continue
+
+        problem_size = GemmCoord(m, n, k)
+        split_k_slices = batch_count if supports_split_k else 1
+
+        # A plain Gemm with more than one batch is launched in Batched mode
+        overridden_mode = mode
+        if mode == GemmUniversalMode.Gemm and batch_count > 1:
+            overridden_mode = GemmUniversalMode.Batched
+
+        passed = testbed.run(
+            overridden_mode,
+            problem_size,
+            batch_count,
+            split_k_slices,
+            alpha,
+            beta,
+        )
+
+        if not passed:
+            return False
+
+    return True
--- a/test/python/cutlass/gemm/run_all_tests.py
+++ b/test/python/cutlass/gemm/run_all_tests.py
@ -30,12 +30,14 @@
 #
 #################################################################################################

+import pathlib
 import unittest


 if __name__ == '__main__':
    loader = unittest.TestLoader()
-    tests = loader.discover('./', 'conv2d_*.py')
+    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
+    tests = loader.discover(script_dir, 'gemm_*.py')
    testRunner = unittest.runner.TextTestRunner()
    results = testRunner.run(tests)
    if not results.wasSuccessful():
--- a/test/python/cutlass/gemm/utils.py
+++ b/test/python/cutlass/gemm/utils.py
@ -0,0 +1,239 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import cutlass
+
+from cutlass import (
+    DataTypeNames,
+    EpilogueScheduleSuffixes,
+    KernelScheduleSuffixes,
+    LayoutType,
+    OpcodeClassNames,
+    ShortDataTypeNames,
+    ShortLayoutTypeNames
+)
+from cutlass.backend import library
+from cutlass.backend.utils.software import SubstituteTemplate
+
+from gemm_testbed import test_all_gemm
+
+
+class Layout:
+    """
+    Shorthand that lets tests name operand layouts in transpose (T) / non-transpose (N) terminology
+    rather than in row- and column-major terminology
+    """
+
+    # N maps to column-major, T maps to row-major
+    N = LayoutType.ColumnMajor
+    T = LayoutType.RowMajor
+
+
+class LayoutCombination:
+    """
+    Enumerates every combination of row- and column-major layouts for the three operands of a GEMM.
+    Each attribute is a 3-tuple of layouts whose name spells the tuple out with the letters of
+    ``Layout`` (e.g., ``NTN`` is ``(Layout.N, Layout.T, Layout.N)``)
+    """
+
+    # Temporary aliases to keep the table compact; removed again below so that the class only
+    # exposes the eight combinations
+    _n, _t = Layout.N, Layout.T
+
+    NNN = (_n, _n, _n)
+    NNT = (_n, _n, _t)
+    NTN = (_n, _t, _n)
+    NTT = (_n, _t, _t)
+    TNN = (_t, _n, _n)
+    TNT = (_t, _n, _t)
+    TTN = (_t, _t, _n)
+    TTT = (_t, _t, _t)
+
+    del _n, _t
+
+
+def get_name(
+    layouts,
+    alignments,
+    element_output,
+    element_accumulator,
+    element_epilogue,
+    cluster_shape,
+    threadblock_shape,
+    stages,
+    element_a,
+    element_b,
+    arch,
+    opclass,
+    kernel_schedule=None,
+    epilogue_schedule=None,
+    suffix="",
+):
+    """
+    Builds the procedurally-generated name of a GEMM test case from its configuration.
+
+    :param layouts: indexable container of layouts of A, B, and C operands
+    :param alignments: indexable container of alignments of A, B, and C operands
+    :param element_output: data type of the output element
+    :param element_accumulator: data type used in accumulation
+    :param element_epilogue: data type used in computing the epilogue (accepted, but not part of the name)
+    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
+    :param threadblock_shape: indexable container of dimensions of threadblock tiles
+    :param stages: number of pipeline stages to use in the kernel; ``None`` is rendered as "auto"
+    :type stages: int
+    :param element_a: data type of operand A
+    :param element_b: data type of operand B
+    :param arch: compute capability of kernel being generated
+    :type arch: int
+    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
+    :type opclass: cutlass.OpcodeClass
+    :param kernel_schedule: kernel schedule type; omitted from the name when ``None``
+    :type kernel_schedule: cutlass.KernelScheduleType
+    :param epilogue_schedule: epilogue schedule type; omitted from the name when ``None``
+    :type epilogue_schedule: cutlass.EpilogueScheduleType
+    :param suffix: additional string appended to the end of the name
+    :type suffix: str
+
+    :return: name of the test case
+    :rtype: str
+    """
+    # Optional components collapse to a fixed placeholder when they are not supplied
+    if stages is None:
+        stages_tag = "auto"
+    else:
+        stages_tag = str(stages)
+    kernel_tag = KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else ""
+    epilogue_tag = EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else ""
+    extra_tag = suffix if suffix is not None else ""
+
+    layout_a, layout_b, layout_c = layouts[0], layouts[1], layouts[2]
+    align_a, align_b, align_c = alignments[0], alignments[1], alignments[2]
+    cluster_m, cluster_n, cluster_k = cluster_shape[0], cluster_shape[1], cluster_shape[2]
+    tb_m, tb_n, tb_k = threadblock_shape[0], threadblock_shape[1], threadblock_shape[2]
+
+    substitutions = {
+        "arch": str(arch),
+        "eA": DataTypeNames[element_a],
+        "eB": DataTypeNames[element_b],
+        "eC": DataTypeNames[element_output],
+        "lA": ShortLayoutTypeNames[layout_a],
+        "lB": ShortLayoutTypeNames[layout_b],
+        "lC": ShortLayoutTypeNames[layout_c],
+        "opclass": OpcodeClassNames[opclass],
+        "acc": DataTypeNames[element_accumulator],
+        "cM": str(cluster_m),
+        "cN": str(cluster_n),
+        "cK": str(cluster_k),
+        "tbM": str(tb_m),
+        "tbN": str(tb_n),
+        "tbK": str(tb_k),
+        "stages": stages_tag,
+        "aA": str(align_a),
+        "aB": str(align_b),
+        "aC": str(align_c),
+        "k": kernel_tag,
+        "e": epilogue_tag,
+        "suffix": extra_tag,
+    }
+
+    template = "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"
+    return SubstituteTemplate(template, substitutions)
+
+
+def add_test_gemm(
+    cls=None,
+    cc=None,
+    element=None,
+    layouts=None,
+    alignments=None,
+    element_output=None,
+    element_accumulator=None,
+    cluster_shape=None,
+    threadblock_shape=None,
+    warp_count=None,
+    stages=None,
+    opclass=None,
+    swizzle=None,
+    kernel_schedule=None,
+    epilogue_schedule=None,
+    compilation_modes=['nvcc', 'nvrtc']):
+    """
+    Create test-running functions with the given specification and set them as methods of ``cls``.
+    One method is generated per entry of ``compilation_modes``; its name is produced by ``get_name``
+    with the compilation mode as suffix.
+
+    :param cls: class to which the generated methods will be added
+    :type cls: type
+    :param cc: compute capability to compile for
+    :type cc: int
+    :param element: data type of A and B operands
+    :type element: cutlass.DataType
+    :param layouts: layouts of A, B, and C operands
+    :type layouts: list or tuple
+    :param alignments: alignments of A, B, and C operands
+    :type alignments: list or tuple
+    :param element_output: data type of the output element
+    :type element_output: cutlass.DataType
+    :param element_accumulator: data type used in accumulation
+    :type element_accumulator: cutlass.DataType
+    :param cluster_shape: dimensions of clusters
+    :type cluster_shape: list or tuple
+    :param threadblock_shape: dimensions of threadblock tiles
+    :type threadblock_shape: list or tuple
+    :param warp_count: warps to be launched per threadblock dimension
+    :type warp_count: list or tuple
+    :param stages: number of pipeline stages to use in the kernel
+    :type stages: int
+    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
+    :type opclass: cutlass.OpcodeClass
+    :param swizzle: threadblock swizzling functor
+    :param kernel_schedule: kernel schedule to use (only used in the generated test name here)
+    :type kernel_schedule: cutlass.KernelScheduleType
+    :param epilogue_schedule: epilogue schedule to use (only used in the generated test name here)
+    :type epilogue_schedule: cutlass.EpilogueScheduleType
+    :param compilation_modes: list of compilers to use in testing the kernel (options: 'nvrtc', 'nvcc')
+    :type compilation_modes: list
+    """
+
+    def _make_run(compilation_mode):
+        """
+        Returns a test method bound to ``compilation_mode``.
+
+        A function defined directly in the loop below would look the loop variable up when the test
+        executes, i.e. after the loop has finished, so every generated test would run with the last
+        compilation mode. Passing the mode through this factory captures the value per test.
+        """
+        def run(self):
+            """
+            Dynamically-generated function that constructs a GEMM operation and verifies it against
+            multiple test cases.
+            """
+            element_A = element
+            element_B = element
+            layout_A, layout_B, layout_C = layouts
+            alignment_A, alignment_B, alignment_C = alignments
+
+            plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
+                                element_C=element_output, element_D=element_output,
+                                layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
+                                element_accumulator=element_accumulator,
+                                kernel_cc=cc)
+
+            plan.opclass = opclass
+            if swizzle is not None:
+                plan.swizzling_functor = swizzle
+
+            # Start from the first tile description offered by the plan and override it with the
+            # requested configuration
+            td = plan.tile_descriptions()[0]
+
+            if warp_count is not None:
+                td.warp_count = warp_count
+            td.threadblock_shape = threadblock_shape
+            td.stages = stages
+            td.cluster_shape = cluster_shape
+            op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
+            self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))
+
+        return run
+
+    # The epilogue is computed in the accumulator's data type
+    element_epilogue = element_accumulator
+
+    for compilation_mode in compilation_modes:
+        name = get_name(
+            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
+            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
+            stages=stages, element_a=element, element_b=element, arch=cc, opclass=opclass,
+            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')
+
+        setattr(cls, name, _make_run(compilation_mode))
--- a/test/python/cutlass/interface/conv2d_interface.py
+++ b/test/python/cutlass/interface/conv2d_interface.py
@ -38,7 +38,6 @@ from math import ceil
 import unittest

 import cutlass
-import cutlass_bindings
 import cutlass.utils.datatypes as datatypes
 from cutlass.backend.utils.device import device_cc
 from utils import ExpectException
--- a/test/python/cutlass/interface/evt_interface.py
+++ b/test/python/cutlass/interface/evt_interface.py
@ -0,0 +1,245 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Test the EVT interface
+"""
+
+import numpy as np
+import unittest
+
+import cutlass
+from cutlass import LayoutType, Tensor
+from cutlass.backend.utils.device import device_cc
+from cutlass.epilogue import reshape, permute
+
+from utils import ExpectException
+
+
+@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
+class EVTErrorTests(unittest.TestCase):
+    """
+    Tests various error scenarios that arise with the EVT interface
+    """
+    # NOTE(review): the third positional argument given to ExpectException throughout this class is
+    # presumably its ``verify_msg`` flag, i.e. the raised exception is expected to format exactly as
+    # the given "<ExceptionType>: <message>" string -- confirm against utils.ExpectException.
+    # NOTE(review): the nested ``evt_*`` functions are only handed to ``cutlass.epilogue.trace``;
+    # the expected messages quote their variable names, so they should not be renamed or reworded.
+    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
+    def test_root_not_d(self):
+        """
+        Test when "D" does not exist in Sm90 EVT
+        """
+        def evt_root_not_d(accum, alpha):
+            F = accum * alpha
+            return F
+        
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "alpha": 1.2,
+            "F": self.fake_tensor(np.float16, (6, 512, 512))
+        }
+        
+        with ExpectException(device_cc() == 90, 
+            "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
+            "but the variable 'D' is not found in the return values.", True):
+            
+            cutlass.epilogue.trace(evt_root_not_d, example_tensors)
+
+    def test_no_accum(self):
+        """
+        Test when "accum" is not in input arguments
+        """
+        def evt_no_accum(alpha, C):
+            D = alpha * C
+            return D
+        
+        example_tensors = {
+            "C": self.fake_tensor(np.float16, (6, 512, 512)),
+            "alpha": 1.2,
+            "D": self.fake_tensor(np.float16, (6, 512, 512))
+        }
+        
+        with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
+            cutlass.epilogue.trace(evt_no_accum, example_tensors)
+    
+    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
+    def test_too_much_shared_memory(self):
+        """
+        Test when the epilogue consumes too much shared memory
+        """
+        # Five extra inputs and five outputs are used to build an epilogue that is too large
+        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
+            D1 = accum + C1
+            D2 = D1 + C2
+            D3 = D2 + C3
+            D4 = D3 + C4
+            D = D4 + C5
+            return D, D1, D2, D3, D4
+        
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C1": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C2": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C3": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C4": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C5": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D1": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D2": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D3": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D4": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D": self.fake_tensor(np.float16, (6, 512, 512))
+        }
+        
+        # Tracing itself is expected to succeed; the error is expected when the visitor is
+        # assigned to the plan below
+        epilogue_visitor = cutlass.epilogue.trace(evt_too_much_shared_memory, example_tensors)
+        
+        plan = cutlass.op.Gemm(
+            element=np.float16, layout=cutlass.LayoutType.RowMajor,
+            element_accumulator=np.float32
+        )
+        
+        with ExpectException(True, 
+            "RuntimeError: The epilogue consumes too much shared memory. " 
+            "No valid tile description is found in the generator.", True):
+            plan.epilogue_visitor = epilogue_visitor
+    
+    def test_not_ssa(self):
+        """
+        Test when the epilogue is not in SSA
+        """
+        # Case 1: 'F' is assigned twice
+        def evt_redefine(accum, C, alpha):
+            F = accum + C
+            F = F * alpha
+            D = F
+            return D, F
+
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C": self.fake_tensor(np.float16, (6, 512, 512)),
+            "alpha": 1.5,
+            "D": self.fake_tensor(np.float16, (6, 512, 512)),
+            "F": self.fake_tensor(np.float16, (6, 512, 512))
+        }
+        
+        with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
+            cutlass.epilogue.trace(evt_redefine, example_tensors)
+
+        # Case 2: 'C' is used but is deliberately neither a parameter nor assigned
+        def evt_undefine(accum, alpha):
+            F = accum + C
+            D = F * alpha
+            return D, F
+        
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "alpha": 1.5,
+            "D": self.fake_tensor(np.float16, (6, 512, 512)),
+            "F": self.fake_tensor(np.float16, (6, 512, 512))
+        }
+        
+        with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
+            cutlass.epilogue.trace(evt_undefine, example_tensors)
+    
+    def test_missing_example_tensor(self):
+        """
+        Test when the example tensor of an input/output variable is not provided
+        """
+        def evt_missing_example_tensor(accum, C):
+            D = accum + C
+            return D
+        
+        # First attempt: no example tensor for the output 'D'
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C": self.fake_tensor(np.float16, (6, 512, 512)),
+        }
+        
+        with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
+            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)
+        
+        # Second attempt: no example tensor for the input 'C'
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D": self.fake_tensor(np.float16, (6, 512, 512)),
+        }
+        
+        with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
+            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)
+        
+    def test_return_expression(self):
+        """
+        Test when the return value is an expression
+        """
+        def evt_return_expr(accum, C):
+            return accum + C
+        
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
+            "C": self.fake_tensor(np.float16, (6, 512, 512)),
+        }
+        
+        with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
+            cutlass.epilogue.trace(evt_return_expr, example_tensors)
+    
+    def test_incompatible_shape(self):
+        """
+        Test when the shape of example tensors are incompatible
+        """
+        def evt_incompatible_shape(accum, C):
+            D = accum + C
+            return D
+        
+        # 'accum' is (6, 256, 512) while 'C' and 'D' are (6, 512, 512)
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 256, 512)),
+            "C": self.fake_tensor(np.float16, (6, 512, 512)),
+            "D": self.fake_tensor(np.float16, (6, 512, 512))
+        }
+        
+        with ExpectException(True, 
+            "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):
+            cutlass.epilogue.trace(evt_incompatible_shape, example_tensors)
+    
+    def test_no_matching_impl(self):
+        """
+        Test when no op implementation matches a node: a (16, 32) 'bias' that is permuted and
+        reshaped to (512, 1) before being added to 'accum'
+        """
+        def evt_no_matching_impl(accum, bias):
+            D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
+            return D
+
+        example_tensors = {
+            "accum": self.fake_tensor(np.float16, (6, 512, 256)),
+            "bias": self.fake_tensor(np.float16, (16, 32)),
+            "D": self.fake_tensor(np.float16, (6, 512, 256))
+        }
+        
+        with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
+            cutlass.epilogue.trace(evt_no_matching_impl, example_tensors)
+    #
+    # Helper functions
+    #
+    
+    def fake_tensor(self, element, shape):
+        # Builds the example Tensor used by the tests from an element type and a shape;
+        # the layout tag is always row-major
+        return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)
+
+
+# Run the tests defined in this module when the file is executed as a script
+if __name__ == '__main__':
+    unittest.main()
--- a/test/python/cutlass/interface/gemm_interface.py
+++ b/test/python/cutlass/interface/gemm_interface.py
@ -38,7 +38,6 @@ from math import ceil
 import unittest

 import cutlass
-import cutlass_bindings
 import cutlass.utils.datatypes as datatypes
 from cutlass.backend.utils.device import device_cc
 from utils import ExpectException
@ -262,13 +261,13 @@ class GemmErrorTests(unittest.TestCase):

        # Ensure that all tile descriptions have opclass of TensorOp
        for td in plan.tile_descriptions():
-            assert td.math_instruction.opcode_class == cutlass_bindings.OpClass.TensorOp
+            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.TensorOp

        plan.opclass = cutlass.OpcodeClass.Simt

        # Ensure that all tile descriptions have opclass of Simt
        for td in plan.tile_descriptions():
-            assert td.math_instruction.opcode_class == cutlass_bindings.OpClass.Simt
+            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.Simt

    def test_invalid_tile_description(self):
        """
--- a/test/python/cutlass/interface/utils.py
+++ b/test/python/cutlass/interface/utils.py
@ -50,9 +50,10 @@ class ExpectException:
    :param message: message to print if an exception is raised when not expected or vice versa
    :type message: str
    """
-    def __init__(self, exception_expected: bool, message: str = ''):
+    def __init__(self, exception_expected: bool, message: str = '', verify_msg=False):
+        # ``verify_msg``: when true, ``__exit__`` additionally compares the raised exception,
+        # formatted as "<ExceptionType>: <exception text>", against ``message``
        self.exception_expected = exception_expected
        self.message = message
+        self.verify_msg = verify_msg

    def __enter__(self):
        return self
@ -60,6 +61,9 @@ class ExpectException:
    def __exit__(self, exc_type, exc_val, traceback):
+        """
+        Asserts that an exception was raised inside the ``with`` body if and only if one was
+        expected and, when ``verify_msg`` is set, that it formats as ``self.message``.
+
+        :param exc_type: type of the exception raised in the ``with`` body, or None
+        :param exc_val: exception instance raised in the ``with`` body, or None
+        :param traceback: traceback of the raised exception, or None (unused)
+
+        :return: True, so that the exception raised in the ``with`` body is suppressed
+        :rtype: bool
+        :raises AssertionError: if the expectation or the message check fails
+        """
        exception_raised = exc_type is not None
        assert self.exception_expected == exception_raised, self.message
+        # Compare messages only when an exception was actually raised: with
+        # exception_expected=False and verify_msg=True, exc_type is None and has no __name__
+        if self.verify_msg and exception_raised:
+            exc_message = f"{exc_type.__name__}: {exc_val}"
+            assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"

        # Suppress the exception
        return True
--- a/test/python/pycute/run_all_tests.py
+++ b/test/python/pycute/run_all_tests.py
@ -0,0 +1,75 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Utility script for discovering and running all PyCuTe tests
+"""
+
+import argparse
+import logging
+import pathlib
+import unittest
+
+
+def numeric_log_level(log_level: str) -> int:
+  """
+  Translates the name of a log level into the integer constant that the ``logging``
+  module defines for it. The lookup is case-insensitive.
+
+  :param log_level: string representation of log level (e.g., 'INFO', 'DEBUG')
+  :type log_level: str
+
+  :return: numeric representation of log level
+  :rtype: int
+
+  :raises ValueError: if ``logging`` has no integer attribute with the upper-cased name
+  """
+  level = getattr(logging, log_level.upper(), None)
+  if isinstance(level, int):
+    return level
+  raise ValueError(f"Invalid log level: {log_level}")
+
+
+if __name__ == "__main__":
+  # Command-line interface: ``--log-level`` is converted to its numeric value by numeric_log_level
+  cli = argparse.ArgumentParser()
+  cli.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
+                   help='Logging level to be used by the generator script')
+  options = cli.parse_args()
+
+  # Apply the requested logging level before any test module is loaded
+  logging.basicConfig(level=options.log_level)
+
+  # Discover the test_*.py modules located next to this script, independent of the
+  # directory the script is launched from
+  discovery = unittest.TestLoader()
+  tests_root = str(pathlib.Path(__file__).parent.resolve()) + '/'
+  suite = discovery.discover(tests_root, "test_*.py")
+  outcome = unittest.runner.TextTestRunner().run(suite)
+  if not outcome.wasSuccessful():
+    raise Exception("Test cases failed")
--- a/test/python/pycute/test_coalesce.py
+++ b/test/python/pycute/test_coalesce.py
@ -0,0 +1,95 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Unit tests for pycute.coalesce
+"""
+
+import logging
+import unittest
+
+from pycute import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class TestCoalesce(unittest.TestCase):
+  """
+  Checks that the result of `coalesce` agrees with the layout it was computed from
+  """
+
+  def helper_test_coalesce(self, layout):
+    """
+    Coalesces `layout` and verifies that the result has the same size as `layout`
+    and returns the same value as `layout` for every integer in `range(size(layout))`
+
+    :param layout: layout to coalesce and to compare the result against
+    """
+    layoutR = coalesce(layout)
+
+    _LOGGER.debug(f"{layout}  =>  {layoutR}")
+
+    self.assertEqual(size(layoutR), size(layout))
+
+    for idx in range(size(layout)):
+      self.assertEqual(layoutR(idx), layout(idx))
+
+  def test_coalesce(self):
+    """
+    Runs the coalesce check over a fixed list of layouts
+    """
+    # Each entry holds the positional arguments handed to the Layout constructor
+    layout_args = [
+      (1, 0),
+      (1, 1),
+      ((2,4),),
+      ((2,4,6),),
+      ((2,4,6), (1,6,2)),
+      ((2,1,6), (1,7,2)),
+      ((2,1,6), (4,7,8)),
+      ((2,(4,6)),),
+      ((2,4), (4,1)),
+      ((2,4,6), (24,6,1)),
+      ((2,1,3), (2,4,4)),
+      (((2,2),(2,2)), ((1,4),(8,32))),
+    ]
+
+    # Layouts are built one at a time, immediately before they are checked
+    for args in layout_args:
+      self.helper_test_coalesce(Layout(*args))
+
+
+# Run the tests of this module through unittest's command-line entry point when the
+# file is executed directly
+if __name__ == "__main__":
+  unittest.main()
--- a/test/python/pycute/test_complement.py
+++ b/test/python/pycute/test_complement.py
@ -0,0 +1,92 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Unit tests for pycute.complement
+"""
+
+import logging
+import unittest
+
+from pycute import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class TestComplement(unittest.TestCase):
+  """
+  Checks the disjointness post-condition of `complement`
+  """
+
+  def helper_test_complement(self, layout):
+    """
+    Computes the complement of `layout` and verifies that the only value the two
+    layouts may have in common is 0
+
+    :param layout: layout whose complement is checked
+    """
+    layoutR = complement(layout)
+
+    _LOGGER.debug(f"{layout}  =>  {layoutR}")
+
+    # Post-condition: test disjointness of the codomains
+    # A unittest assertion is used instead of a bare `assert` so that the check is not
+    # stripped when Python runs with -O and matches the other PyCuTe tests.
+    for a in range(size(layout)):
+      for b in range(size(layoutR)):
+        value, valueR = layout(a), layoutR(b)
+        self.assertTrue((value != valueR) or (value == 0 and valueR == 0),
+                        f"layout({a}) and complement({b}) both evaluate to {value}")
+
+  def test_complement(self):
+    """
+    Runs the complement check over a fixed list of layouts
+    """
+    # Each entry holds the positional arguments handed to the Layout constructor
+    layout_args = [
+      (1, 0),
+      (1, 1),
+      (4, 0),
+      ((2,4), (1,2)),
+      ((2,3), (1,2)),
+      ((2,4), (1,4)),
+      ((2,4,8), (8,1,64)),
+      (((2,2),(2,2)), ((1,4),(8,32))),
+      ((2,(3,4)), (3,(1,6))),
+      ((4,6), (1,6)),
+      ((4,10), (1,10)),
+    ]
+
+    for args in layout_args:
+      self.helper_test_complement(Layout(*args))
+
+
+# Run the tests of this module through unittest's command-line entry point when the
+# file is executed directly
+if __name__ == "__main__":
+  unittest.main()
--- a/test/python/pycute/test_composition.py
+++ b/test/python/pycute/test_composition.py
@ -0,0 +1,204 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Unit tests for pycute.composition
+"""
+
+import logging
+import unittest
+
+from pycute import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class TestComposition(unittest.TestCase):
+  """
+  Checks that the result of `composition` agrees with applying the two layouts in turn
+  """
+
+  def helper_test_composition(self, layoutA, layoutB):
+    """
+    Composes `layoutA` with `layoutB` and verifies the result element by element
+
+    :param layoutA: layout applied to the output of `layoutB`
+    :param layoutB: layout applied first
+    """
+    layoutR = composition(layoutA, layoutB)
+
+    _LOGGER.debug(f"{layoutA} o {layoutB}  =>  {layoutR}")
+
+    # True post-condition: Every coordinate c of layoutB with L1D(c) < size(layoutR) is a coordinate of layoutR.
+
+    # Test that R(c) = A(B(c)) for all coordinates c in layoutR
+    for idx in range(size(layoutR)):
+      self.assertEqual(layoutR(idx), layoutA(layoutB(idx)))
+
+  def test_composition(self):
+    """
+    Runs the composition check over a fixed list of (layoutA, layoutB) pairs
+    """
+    check = self.helper_test_composition
+
+    # Size-1 layouts
+    check(Layout(1,0), Layout(1,0))
+    check(Layout(1,0), Layout(1,1))
+    check(Layout(1,1), Layout(1,0))
+    check(Layout(1,1), Layout(1,1))
+
+    # Layouts with integer shapes
+    check(Layout(4), Layout(4))
+    check(Layout(4, 2), Layout(4))
+    check(Layout(4), Layout(4, 2))
+    check(Layout(4, 0), Layout(4))
+    check(Layout(4), Layout(4, 0))
+    check(Layout(1, 0), Layout(4))
+    check(Layout(4), Layout(1, 0))
+    check(Layout(4), Layout(2))
+    check(Layout(4, 2), Layout(2))
+    check(Layout(4), Layout(2, 2))
+    check(Layout(4, 2), Layout(2, 2))
+
+    # Integer-shaped layoutA with tuple-shaped layoutB
+    check(Layout(12), Layout((4,3)))
+    check(Layout(12, 2), Layout((4,3)))
+    check(Layout(12), Layout((4,3), (3,1)))
+    check(Layout(12, 2), Layout((4,3), (3,1)))
+    check(Layout(12), Layout((2,3), (2,4)))
+
+    # Tuple-shaped layoutA
+    check(Layout((4,3)), Layout((4,3)))
+    check(Layout((4,3)), Layout(12))
+    check(Layout((4,3)), Layout(6, 2))
+    check(Layout((4,3)), Layout((6,2), (2,1)))
+    check(Layout((4,3), (3,1)), Layout((4,3)))
+    check(Layout((4,3), (3,1)), Layout(12))
+    check(Layout((4,3), (3,1)), Layout(6, 2))
+    check(Layout((4,3), (3,1)), Layout((6,2), (2,1)))
+
+    # Nested shapes and strides
+    check(Layout((8,8)), Layout(((2,2,2), (2,2,2)),((1,16,4), (8,2,32))))
+    check(Layout((8,8), (8,1)), Layout(((2,2,2), (2,2,2)),((1,16,4), (8,2,32))))
+    check(Layout(((2,2,2), (2,2,2)),((1,16,4), (8,2,32))), Layout(8, 4))
+
+    check(Layout((4,2), (1,16)), Layout((4,2), (2,1)))
+    check(Layout((2,2), (2,1)), Layout((2,2), (2,1)))
+
+    # Three-mode layouts
+    check(Layout((4,8,2)), Layout((2,2,2), (2,8,1)))
+    check(Layout((4,8,2), (2,8,1)), Layout((2,2,2), (1,8,2)))
+    check(Layout((4,8,2), (2,8,1)), Layout((4,2,2), (2,8,1)))
+
+
+# Run the tests of this module through unittest's command-line entry point when the
+# file is executed directly
+if __name__ == "__main__":
+  unittest.main()
--- a/test/python/pycute/test_int_tuple.py
+++ b/test/python/pycute/test_int_tuple.py
@ -0,0 +1,80 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Unit tests for pycute.int_tuple
+"""
+
+import unittest
+
+from pycute import *
+
+
+class TestIntTuple(unittest.TestCase):
+  """
+  Checks the integer-tuple helpers `product`, `inner_product`, `shape_div` and
+  `prefix_product` against hand-computed expected values
+  """
+
+  def test_product(self):
+    """
+    Checks `product` on an integer, a flat tuple and a nested tuple
+    """
+    self.assertEqual(product(2), 2)
+
+    self.assertEqual(product((3,2)), 6)
+
+    # Assert on a single call: wrapping the result in a second `product` would hide
+    # a partially reduced result such as (6,4), whose product is also 24.
+    self.assertEqual(product(((2,3),4)), 24)
+
+  def test_inner_product(self):
+    """
+    Checks `inner_product` on integers, flat tuples and nested tuples
+    """
+    self.assertEqual(inner_product(2, 3), 6)
+
+    self.assertEqual(inner_product((1,2), (3,2)), 7)
+
+    self.assertEqual(inner_product(((2,3),4), ((2,1),2)), 15)
+
+  def test_shape_div(self):
+    """
+    Checks `shape_div` of flat and nested shapes by an integer divisor
+    """
+    self.assertEqual(shape_div((3,4), 6), (1,2))
+
+    self.assertEqual(shape_div((3,4), 12), (1,1))
+
+    self.assertEqual(shape_div((3,4), 36), (1,1))
+
+    self.assertEqual(shape_div(((3,4),6), 36), ((1,1),2))
+
+    self.assertEqual(shape_div((6,(3,4)), 36), (1,(1,2)))
+
+  def test_prefix_product(self):
+    """
+    Checks `prefix_product` on an integer, flat tuples and nested tuples
+    """
+    self.assertEqual(prefix_product(2), 1)
+
+    self.assertEqual(prefix_product((3,2)), (1,3))
+
+    self.assertEqual(prefix_product((3,2,4)), (1,3,6))
+
+    self.assertEqual(prefix_product(((2,3),4)), ((1,2),6))
+
+    self.assertEqual(prefix_product(((2,3),(2, 1, 2),( 5,  2,  1))),
+                                    ((1,2),(6,12,12),(24,120,240)))
+
+
--- a/test/python/pycute/test_left_inverse.py
+++ b/test/python/pycute/test_left_inverse.py
@ -0,0 +1,87 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Unit tests for pycute.left_inverse
+"""
+
+import logging
+import unittest
+
+from pycute import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class TestLeftInverse(unittest.TestCase):
+  """
+  Checks that applying the result of `left_inverse` after the layout returns the input
+  """
+
+  def helper_test_left_inverse(self, layout):
+    """
+    Computes the left inverse of `layout` and verifies that
+    `inv_layout(layout(i)) == i` for every integer i in `range(size(layout))`
+
+    :param layout: layout whose left inverse is checked
+    """
+    inv_layout = left_inverse(layout)
+
+    _LOGGER.debug(f"{layout}  =>  {inv_layout}")
+
+    for idx in range(size(layout)):
+      self.assertEqual(inv_layout(layout(idx)), idx)
+
+  def test_left_inverse(self):
+    """
+    Runs the left-inverse check over a fixed list of layouts
+    """
+    # Each entry holds the positional arguments handed to the Layout constructor
+    layout_args = [
+      (1, 0),
+      ((1,1), (0,0)),
+      (1, 1),
+      (4, 1),
+      (4, 2),
+      ((8,4), (1,8)),
+      ((8,4), (4,1)),
+      ((2,4,6), (1,2,8)),
+      ((2,4,6), (4,1,8)),
+      ((4,2), (1,16)),
+    ]
+
+    for args in layout_args:
+      self.helper_test_left_inverse(Layout(*args))
+
+
+# Run the tests of this module through unittest's command-line entry point when the
+# file is executed directly
+if __name__ == "__main__":
+  unittest.main()
--- a/test/python/pycute/test_right_inverse.py
+++ b/test/python/pycute/test_right_inverse.py
@ -0,0 +1,96 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+"""
+Unit tests for pycute.right_inverse
+"""
+
+import logging
+import unittest
+
+from pycute import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class TestRightInverse(unittest.TestCase):
+  """
+  Checks that applying the layout after the result of `right_inverse` returns the input
+  """
+
+  def helper_test_right_inverse(self, layout):
+    """
+    Computes the right inverse of `layout` and verifies that
+    `layout(inv_layout(i)) == i` for every integer i in `range(size(inv_layout))`
+
+    :param layout: layout whose right inverse is checked
+    """
+    inv_layout = right_inverse(layout)
+
+    _LOGGER.debug(f"{layout}  =>  {inv_layout}")
+
+    for idx in range(size(inv_layout)):
+      self.assertEqual(layout(inv_layout(idx)), idx)
+
+  def test_right_inverse(self):
+    """
+    Runs the right-inverse check over a fixed list of layouts
+    """
+    # Each entry holds the positional arguments handed to the Layout constructor
+    layout_args = [
+      (1, 0),
+      ((1,1), (0,0)),
+      ((3,7), (0,0)),
+      (1, 1),
+      (4, 0),
+      (4, 1),
+      (4, 2),
+      ((2,4), (0,2)),
+      ((8,4), (1,8)),
+      ((8,4), (4,1)),
+      ((2,4,6), (1,2,8)),
+      ((2,4,6), (4,1,8)),
+      ((4,2), (1,16)),
+    ]
+
+    for args in layout_args:
+      self.helper_test_right_inverse(Layout(*args))
+
+
+# Run the tests of this module through unittest's command-line entry point when the
+# file is executed directly
+if __name__ == "__main__":
+  unittest.main()
--- a/test/python/backend/gemm/run_all_tests.py
+++ b/test/python/backend/gemm/run_all_tests.py
@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@ -30,12 +30,30 @@
 #
 #################################################################################################

-import cutlass.backend
+"""
+Unit tests for pycute.typing
+"""
+
+import logging
 import unittest
+from pycute import *
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class TestTyping(unittest.TestCase):
+    """
+    Checks how built-in types and values relate to `Integer` under
+    `issubclass` and `isinstance`
+    """
+
+    def helper_test_typing(self, _cls, _obj, cls, expected: bool):
+        """
+        Verifies that `issubclass(_cls, cls)` and `isinstance(_obj, cls)` both
+        equal `expected`
+
+        :param _cls: class passed as the first argument of `issubclass`
+        :param _obj: object passed as the first argument of `isinstance`
+        :param cls: class that `_cls` and `_obj` are checked against
+        :param expected: result both checks must produce
+        :type expected: bool
+        """
+        _LOGGER.debug(f"issubclass({_cls}, {cls})")
+        _LOGGER.debug(f"isinstance({_obj}, {cls})")
+
+        is_subclass = issubclass(_cls, cls)
+        self.assertEqual(expected, is_subclass)
+
+        is_instance = isinstance(_obj, cls)
+        self.assertEqual(expected, is_instance)
+
+    def test_typing(self):
+        """
+        Runs the typing check for int, float, str and bool against `Integer`
+        """
+        # (class, sample object of that class, expected result of both checks)
+        cases = (
+            (int, 1, True),
+            (float, 1., False),
+            (str, 'hi', False),
+            (bool, False, False),
+        )
+        for _cls, _obj, expected in cases:
+            self.helper_test_typing(_cls, _obj, Integer, expected)

 if __name__ == '__main__':
-    cutlass.backend.get_memory_pool(2**30, 2**30)
-    loader = unittest.TestLoader()
-    tests = loader.discover('./', 'gemm_*.py')
-    testRunner = unittest.runner.TextTestRunner()
-    testRunner.run(tests)
+    unittest.main()