Merge remote-tracking branch 'origin/master' into small_alignment

2021-08-16 07:49:08 -07:00
parent f4b0a33633 a01feb93d9
commit 598e35401c
851 changed files with 33727 additions and 5665 deletions
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 add_subdirectory(unit)
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 include(CTest)
--- a/test/unit/common/cutlass_unit_test.h
+++ b/test/unit/common/cutlass_unit_test.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/common/filter_architecture.cpp
+++ b/test/unit/common/filter_architecture.cpp
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/CMakeLists.txt
+++ b/test/unit/conv/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 add_custom_target(cutlass_test_unit_conv)
--- a/test/unit/conv/device/CMakeLists.txt
+++ b/test/unit/conv/device/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 list(SORT CUTLASS_NVCC_ARCHS_ENABLED)
@ -141,6 +141,9 @@ cutlass_test_unit_add_executable(
  conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
  conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
  conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+
+  conv2d_fprop_with_broadcast_sm75.cu
+  conv2d_fprop_with_reduction_sm75.cu
 )

 if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80)
@ -158,15 +161,18 @@ if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80)
  
  cutlass_test_unit_add_executable(
    cutlass_test_unit_conv_device_tensorop_f32_sm80
-  
+
    conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
    conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
    conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
  
    conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
    conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+
+    # Strided Dgrad 
+    conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
  )
-  
+
  # Conv2d - TF32 input, F32 output, F32 accumulation
  
  cutlass_test_unit_add_executable(
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -71,7 +71,8 @@ TEST(SM50_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_s
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    2,
    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kUnity
  >::Kernel;

  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -38,140 +38,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  64x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  128x128_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}

 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
@ -208,52 +74,7 @@ TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_s
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    4,
    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
    cutlass::conv::StrideSupport::kUnity
  >::Kernel;

--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -69,7 +69,8 @@ TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tens
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    3,
    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kUnity
  >::Kernel;

  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -66,7 +66,9 @@ TEST(SM70_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tens
    >,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    2,
-    cutlass::arch::OpMultiplyAdd
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kUnity
  >::Kernel;

  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -67,7 +67,9 @@ TEST(SM75_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tens
    >,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    2,
-    cutlass::arch::OpMultiplyAdd
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kUnity
  >::Kernel;

  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -36,88 +36,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
-  128x128_32x3_64x64x32) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::half_t;
-  using ElementB           = cutlass::half_t;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, cutlass::layout::TensorNHWC,
-    ElementB, cutlass::layout::TensorNHWC,
-    ElementC, cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassTensorOp,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 32>,
-    cutlass::gemm::GemmShape<64, 64, 32>,
-    cutlass::gemm::GemmShape<16, 8, 16>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      128 / cutlass::sizeof_bits<ElementC>::value,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    3,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic,
-    cutlass::conv::StrideSupport::kStrided
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride,
-  128x128_32x3_64x64x32) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::half_t;
-  using ElementB           = cutlass::half_t;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, cutlass::layout::TensorNHWC,
-    ElementB, cutlass::layout::TensorNHWC,
-    ElementC, cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassTensorOp,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 32>,
-    cutlass::gemm::GemmShape<64, 64, 32>,
-    cutlass::gemm::GemmShape<16, 8, 16>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      128 / cutlass::sizeof_bits<ElementC>::value,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    3,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic,
-    cutlass::conv::StrideSupport::kUnity
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride,
  128x128_32x3_64x64x32) {
@ -281,6 +199,5 @@ TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_ten
  /// Run all unit test sizes with device-level Conv2d instance
  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
 }
-
 ////////////////////////////////////////////////////////////////////////////////
 #endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -37,95 +37,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  64x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}

 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
@ -162,107 +73,7 @@ TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    4,
    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  test::conv::device::Conv2dProblemVector user_size;
-
-  user_size.push_back(cutlass::conv::Conv2dProblemSize(
-      {1, 8, 8, 4},   // input size  (NHWC)
-      {8, 1, 1, 4},   // filter size (KRSC)
-      {0, 0, 0, 0},                      // padding (pad_h, _, pad_w, _)
-      {1, 1},                            // stride (stride_h, stride_w)
-      {1, 1}                             // dilation (dilation_h, dilation_w) 
-    ));
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>(user_size));
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  128x128_8x4_64x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<64, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
    cutlass::conv::StrideSupport::kUnity
  >::Kernel;

@ -273,6 +84,7 @@ TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_sim

 }

+
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
  128x128_8x4_64x32x8) {
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -130,93 +130,3 @@ TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_s
 }

 ////////////////////////////////////////////////////////////////////////////////
-TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  64x64_8x2_32x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm50,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
-    2,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM50_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x2_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm50,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
-    2,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
-
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -37,184 +37,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  64x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  128x128_8x4_64x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<64, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-}
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  128x128_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
  128x128_8x4_64x32x8) {
@ -260,50 +82,6 @@ TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_s

 }

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}

 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
@ -348,50 +126,6 @@ TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_
  /// Run all unit test sizes with device-level Conv2d instance
  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
 }
-////////////////////////////////////////////////////////////////////////////////
-
-TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  64x64_8x3_64x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<64, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    3,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-}

 ////////////////////////////////////////////////////////////////////////////////
 #endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -37,96 +37,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  64x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
  128x128_8x4_32x64x8) {
@ -167,106 +77,6 @@ TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt

  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

-  test::conv::device::Conv2dProblemVector user_size;
-
-  user_size.push_back(cutlass::conv::Conv2dProblemSize(
-      {1, 8, 8, 4},   // input size  (NHWC)
-      {8, 1, 1, 4},   // filter size (KRSC)
-      {0, 0, 0, 0},                      // padding (pad_h, _, pad_w, _)
-      {1, 1},                            // stride (stride_h, stride_w)
-      {1, 1}                             // dilation (dilation_h, dilation_w) 
-    ));
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>(user_size));
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  128x128_8x4_64x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<64, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
-
  /// Run all unit test sizes with device-level Conv2d instance
  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());

--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
@ -0,0 +1,221 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32,
+  16x32_8x2_16x16x8) {
+
+  // A, B, C, the accumulator and the epilogue compute type are all Quaternion<float>,
+  // and all three tensors use the NHWC layout.
+  using Element = cutlass::Quaternion<float>;
+  using Layout  = cutlass::layout::TensorNHWC;
+
+  // Tile shapes, matching the "16x32_8x2_16x16x8" test name.
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  // Epilogue functor and threadblock swizzle.
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<Element, 1, Element, Element>;
+  using Swizzle    = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+
+  // Kernel-level convolution using the analytic iterator algorithm, as the suite name states.
+  using Kernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    Element, Layout,
+    Element, Layout,
+    Element, Layout,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOp,
+    Swizzle,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  // Device-level wrapper handed to the shared conv2d testbed.
+  using Conv = cutlass::conv::device::ImplicitGemmConvolution<Kernel>;
+
+  // The test passes only if the testbed reports success.
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv>());
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32,
+  16x64_8x2_8x32x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm).
+  /// Operands, accumulator and epilogue compute type are all Quaternion<float>.
+  using ElementA           = cutlass::Quaternion<float>;
+  using ElementB           = cutlass::Quaternion<float>;
+  using ElementC           = cutlass::Quaternion<float>;
+  using ElementAccumulator = cutlass::Quaternion<float>;
+  using ElementCompute     = cutlass::Quaternion<float>;
+
+
+  /// Device-level Conv2d instance (SIMT, Sm50, NHWC tensors); the two leading
+  /// GemmShapes match the "16x64_8x2_8x32x8" test name.
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    cutlass::gemm::GemmShape<16, 64, 8>,
+    cutlass::gemm::GemmShape<8, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    // Analytic iterator, as the suite name states; the optimized iterator is
+    // exercised by the separate *_Optimized_* suite in this file.
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32,
+  32x32_8x2_16x16x8) {
+
+  // One quaternion element type serves A, B, C, accumulation and epilogue compute;
+  // every tensor is laid out as NHWC.
+  using Element = cutlass::Quaternion<float>;
+  using Layout  = cutlass::layout::TensorNHWC;
+
+  // Tile shapes, matching the "32x32_8x2_16x16x8" test name.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  // Epilogue functor and threadblock swizzle.
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<Element, 1, Element, Element>;
+  using Swizzle    = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+
+  // Kernel-level convolution using the analytic iterator algorithm, as the suite name states.
+  using Kernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    Element, Layout,
+    Element, Layout,
+    Element, Layout,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOp,
+    Swizzle,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  // Device-level wrapper handed to the shared conv2d testbed.
+  using Conv = cutlass::conv::device::ImplicitGemmConvolution<Kernel>;
+
+  // The test passes only if the testbed reports success.
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM50_Device_Conv2d_Fprop_Optimized_ImplicitGemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32,
+  16x32_8x2_16x16x8) {
+
+  // One quaternion element type serves A, B, C, accumulation and epilogue compute;
+  // every tensor is laid out as NHWC.
+  using Element = cutlass::Quaternion<float>;
+  using Layout  = cutlass::layout::TensorNHWC;
+
+  // Tile shapes, matching the "16x32_8x2_16x16x8" test name.
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  // Epilogue functor and threadblock swizzle.
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<Element, 1, Element, Element>;
+  using Swizzle    = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+
+  // Kernel-level convolution using the optimized iterator algorithm, as the suite name states.
+  using Kernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    Element, Layout,
+    Element, Layout,
+    Element, Layout,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOp,
+    Swizzle,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  // Device-level wrapper handed to the shared conv2d testbed.
+  using Conv = cutlass::conv::device::ImplicitGemmConvolution<Kernel>;
+
+  // The test passes only if the testbed reports success.
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv>());
+
+}
+
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
@ -0,0 +1,90 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_with_broadcast_testbed.h"
+        
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+TEST(SM75_Device_Conv2d_Fprop_With_Broadcast_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x128_32x2_64x64x32) {
+
+  // A, B and C are half precision; accumulation and epilogue compute use float.
+  // NOTE(review): the suite name says "f32nhwc" for the third tensor, but the C element
+  // used here is cutlass::half_t - confirm which one is intended.
+  using ElementTensor      = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+  using Layout             = cutlass::layout::TensorNHWC;
+
+  // Tile shapes, matching the "128x128_32x2_64x64x32" test name.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  // Epilogue functor, configured with the GELU_taylor<float> functor.
+  using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
+    ElementTensor, ElementAccumulator, ElementCompute, ElementTensor, ElementTensor, 8,
+    cutlass::epilogue::thread::GELU_taylor<float>
+  >;
+
+  // Kernel-level convolution (tensor-op, Sm75) using the analytic iterator algorithm.
+  using Kernel = typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast<
+    ElementTensor, Layout,
+    ElementTensor, Layout,
+    ElementTensor, Layout,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  // Device-level wrapper handed to the broadcast testbed.
+  using Conv = cutlass::conv::device::ImplicitGemmConvolution<Kernel>;
+
+  // The test passes only if the testbed reports success.
+  EXPECT_TRUE(test::conv::device::TestAllConv2dWithBroadcast<Conv>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // CUTLASS_ARCH_MMA_SM75_SUPPORTED
+
+////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
@ -0,0 +1,88 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/epilogue/thread/linear_combination_with_elementwise.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_with_reduction_testbed.h"
+        
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+TEST(SM75_Device_Conv2d_Fprop_With_Reduction_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x128_32x2_64x64x32) {
+
+  // A, B and C are half precision; accumulation and epilogue compute use float.
+  // NOTE(review): the suite name says "f32nhwc" for the third tensor, but the C element
+  // used here is cutlass::half_t - confirm which one is intended.
+  using ElementTensor      = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+  using Layout             = cutlass::layout::TensorNHWC;
+
+  // Tile shapes, matching the "128x128_32x2_64x64x32" test name.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  // Epilogue functor and the binary functor passed alongside it to the kernel.
+  using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombinationWithElementwise<
+    ElementAccumulator, ElementCompute, ElementTensor, ElementTensor, 8
+  >;
+  using ReductionOp = cutlass::plus<float>;
+
+  // Kernel-level convolution (tensor-op, Sm75) using the analytic iterator algorithm.
+  using Kernel = typename cutlass::conv::kernel::DefaultConv2dFpropWithReduction<
+    ElementTensor, Layout,
+    ElementTensor, Layout,
+    ElementTensor, Layout,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ReductionOp,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  // Device-level wrapper handed to the reduction testbed.
+  using Conv = cutlass::conv::device::ImplicitGemmConvolution<Kernel>;
+
+  // The test passes only if the testbed reports success.
+  EXPECT_TRUE(test::conv::device::TestAllConv2dWithReduction<Conv>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // CUTLASS_ARCH_MMA_SM75_SUPPORTED
+
+////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/conv/device/conv2d_problems.h
+++ b/test/unit/conv/device/conv2d_problems.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -161,7 +161,7 @@ struct TestbedConv2dProblemSizes {
  void initialize_conv2d_default_sizes() {

    ////////////////////////////////////////////////////////////////////////////////////////////
-    // Very Small input size (1x8x8xminimum_channel_size), filter size (3x3 - 7x7), stride (1,1)
+    // Small input size x stride (1,1)
    // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
    ////////////////////////////////////////////////////////////////////////////////////////////
    
@ -229,6 +229,58 @@ struct TestbedConv2dProblemSizes {
      {1, 1}                             // dilation (dilation_h, dilation_w) 
    ));

+    ////////////////////////////////////////////////////////////////////////////////////////////
+    // Small input size x stride (2,2)
+    // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+    ////////////////////////////////////////////////////////////////////////////////////////////
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( 
+      {1, 11, 11, minimum_channel_size},  // input size  (NHWC)
+      {8, 1, 1, minimum_channel_size},    // filter size (KRSC)
+      {0, 0, 0, 0},                       // padding (pad_h, _, pad_w, _)
+      {2, 2},                             // stride (stride_h, stride_w)
+      {1, 1}                              // dilation (dilation_h, dilation_w) 
+    ));
+
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( 
+      {1, 11, 11, minimum_channel_size},   // input size  (NHWC)
+      {8, 3, 3, minimum_channel_size},     // filter size (KRSC)
+      {1, 1, 1, 1},                        // padding (pad_h, _, pad_w, _)
+      {2, 2},                              // stride (stride_h, stride_w)
+      {1, 1}                               // dilation (dilation_h, dilation_w) 
+    ));
+
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( 
+      {1, 13, 13, minimum_channel_size},   // input size  (NHWC)
+      {8, 1, 1, minimum_channel_size},     // filter size (KRSC)
+      {1, 1, 1, 1},                        // padding (pad_h, _, pad_w, _)
+      {2, 2},                              // stride (stride_h, stride_w)
+      {1, 1}                               // dilation (dilation_h, dilation_w) 
+    ));
+
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( 
+      {1, 8, 8, minimum_channel_size},   // input size  (NHWC)
+      {8, 2, 2, minimum_channel_size},   // filter size (KRSC)
+      {1, 1, 1, 1},    // padding (pad_h, _, pad_w, _)
+      {2, 2},          // stride (stride_h, stride_w)
+      {1, 1}           // dilation (dilation_h, dilation_w) 
+    ));
+  
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( 
+      {1, 5, 5, minimum_channel_size},   // input size  (NHWC)
+      {8, 3, 3, minimum_channel_size},   // filter size (KRSC)
+      {1, 1, 1, 1},    // padding (pad_h, _, pad_w, _)
+      {2, 2},          // stride (stride_h, stride_w)
+      {1, 1}           // dilation (dilation_h, dilation_w) 
+    ));
+  
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( 
+      {1, 8, 8, 8},   // input size  (NHWC)
+      {8, 3, 3, 8},   // filter size (KRSC)
+      {0, 0, 0, 0},    // padding (pad_h, _, pad_w, _)
+      {2, 2},          // stride (stride_h, stride_w)
+      {1, 1}           // dilation (dilation_h, dilation_w) 
+    ));
+
    ////////////////////////////////////////////////////////////////////////////////////
    // Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1) 
    ////////////////////////////////////////////////////////////////////////////////////
@ -239,7 +291,15 @@ struct TestbedConv2dProblemSizes {
      {1, 1},             // stride (stride_h, stride_w)
      {1, 1}              // dilation (dilation_h, dilation_w) 
    ));
-  
+    
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 19, 37, 160},     // input size  (NHWC)
+      {224, 3, 3, 160},     // filter size (KRSC)
+      {1, 1, 1, 1},         // padding (pad_h, _, pad_w, _)
+      {2, 2},               // stride (stride_h, stride_w)
+      {1, 1}                // dilation (dilation_h, dilation_w)
+    ));
+
    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
      {1, 16, 16, 160},   // input size  (NHWC)
      {224, 2, 3, 160},   // filter size (KRSC)
@ -284,16 +344,8 @@ struct TestbedConv2dProblemSizes {
    ));

    ////////////////////////////////////////////////////////////////////////////////////
-    // Medium input size (1x16x16x128), filter size (1x1, 3,x3, 5x5), stride (2, 2)  
-    ////////////////////////////////////////////////////////////////////////////////////
-    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
-      {1, 19, 37, 160},     // input size  (NHWC)
-      {224, 3, 3, 160},     // filter size (KRSC)
-      {1, 1, 1, 1},         // padding (pad_h, _, pad_w, _)
-      {2, 2},               // stride (stride_h, stride_w)
-      {1, 1}                // dilation (dilation_h, dilation_w)
-    ));
-  
+    // Medium input size, filter size (1x1, 3x3, 5x5, 7x7), stride (2, 2)
+    //////////////////////////////////////////////////////////////////////////////////// 
    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
      {1, 16, 16, 288},   // input size  (NHWC)
      {160, 5, 5, 288},   // filter size (KRSC)
@ -302,6 +354,61 @@ struct TestbedConv2dProblemSizes {
      {1, 1}              // dilation (dilation_h, dilation_w)
    ));

+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 55, 55, 256},   // input size (NHWC)
+      {512, 1, 1, 256},   // filter size (KRSC)
+      {0, 0, 0, 0},       // padding (pad_h, _, pad_w, _)
+      {2, 2},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    ));
+
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 80, 80, 32},    // input size (NHWC)
+      {64, 5, 5, 32},     // filter size (KRSC)
+      {2, 2, 2, 2},       // padding (pad_h, _, pad_w, _)
+      {2, 2},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    ));
+
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 224, 224, 8},   // input size (NHWC)
+      {64, 7, 7, 8},      // filter size (KRSC)
+      {3, 3, 3, 3},       // padding (pad_h, _, pad_w, _)
+      {2, 2},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    ));
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Medium input size stride (3, 3), filter (3, 3), non-default padding
+    ////////////////////////////////////////////////////////////////////////////////////
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 27, 27, 256},     // input size (NHWC)
+      {512, 3, 3, 256},     // filter size (KRSC)
+      {0, 0, 0, 0},         // padding (pad_h, _, pad_w, _)
+      {3, 3},               // stride (stride_h, stride_w)
+      {1, 1}                // dilation (dilation_h, dilation_w)
+    ));
+    
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Medium input size *mixed* stride (1, 2) and (2, 1), 
+    // filter (3, 3), default padding
+    ////////////////////////////////////////////////////////////////////////////////////
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 27, 27, 256},     // input size (NHWC)
+      {512, 3, 3, 256},     // filter size (KRSC)
+      {1, 1, 1, 1},         // padding (pad_h, _, pad_w, _)
+      {1, 2},               // stride (stride_h, stride_w)
+      {1, 1}                // dilation (dilation_h, dilation_w)
+    ));
+
+    conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 27, 27, 256},     // input size (NHWC)
+      {512, 3, 3, 256},     // filter size (KRSC)
+      {1, 1, 1, 1},         // padding (pad_h, _, pad_w, _)
+      {2, 1},               // stride (stride_h, stride_w)
+      {1, 1}                // dilation (dilation_h, dilation_w)
+    ));
+
    /////////////////////////////////////////////////////////////////////////////
    // Additional input size 
    /////////////////////////////////////////////////////////////////////////////
@ -347,15 +454,15 @@ struct TestbedConv2dProblemSizes {

 #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED                  
  conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize(
-    {1, 124, 224, 96},  // input size  (NHWC)
-    {24, 7, 7, 96},     // filter size (KRSC)
-    {1, 229, 129, 32}   // output size (NPQK)
+    {1, 124, 224, 96},    // input size  (NHWC)
+    {24, 7, 7, 96},       // filter size (KRSC)
+    {1, 229, 129, 32}     // output size (NPQK)
  ));

  conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize(
-    {1, 233, 35, 48},                     // input size  (NHWC)
-    {24, 7, 5, 48},                       // filter size (KRSC)
-    {1, 233, 35, 24}                     // output size (NPQK)
+    {1, 233, 35, 48},     // input size  (NHWC)
+    {24, 7, 5, 48},       // filter size (KRSC)
+    {1, 233, 35, 24}      // output size (NPQK)
  ));

 #endif 
--- a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -0,0 +1,187 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+//                           Strided Dgrad (Analytic)
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x128_32x3_64x64x32) {
+  // Builds a DefaultConv2dDgrad kernel (kAnalytic iterators, kStrided stride support) and runs it through TestAllConv2d.
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm): cutlass::half_t A/B operands, float C/accumulator/compute
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = float;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  /// Device-level Conv2d instance (all three tensors use the TensorNHWC layout)
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,   // presumably the threadblock tile ("128x128_32" in the test name) - confirm
+    cutlass::gemm::GemmShape<64, 64, 32>,     // presumably the warp tile ("64x64x32" in the test name) - confirm
+    cutlass::gemm::GemmShape<16, 8, 16>,      // presumably the tensor-op instruction shape - confirm
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,   // number of ElementC values that fit in 128 bits
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<>,
+    3,                                        // presumably the pipeline stage count (the "x3" in the test name) - confirm
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kStrided
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  // Extra problem sizes handed to TestAllConv2d; stays empty unless the '#if 0' block below is enabled.
+  test::conv::device::Conv2dProblemVector problem_size_list;
+
+#if 0 // debugging aid: change to 1 to run these specific problem sizes in the unit test first
+    problem_size_list.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 56, 56, 8},   // input size (NHWC)
+      {8, 1, 1, 8},   // filter size (KRSC)
+      {0, 0, 0, 0},       // padding (pad_h, _, pad_w, _)
+      {2, 2},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    ));
+
+    problem_size_list.push_back(cutlass::conv::Conv2dProblemSize(
+      {1, 55, 55, 8},   // input size (NHWC)
+      {8, 1, 1, 8},   // filter size (KRSC)
+      {0, 0, 0, 0},       // padding (pad_h, _, pad_w, _)
+      {2, 2},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    ));
+
+#endif
+
+  /// Run all unit test sizes with device-level Conv2d instance; the test fails if TestAllConv2d returns false
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>(problem_size_list));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x256_32x3_64x64x32) {
+  // Builds a DefaultConv2dDgrad kernel (kAnalytic iterators, kStrided stride support) with a 128x256x32 first GemmShape.
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm): cutlass::half_t A/B operands, float C/accumulator/compute
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = float;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  /// Device-level Conv2d instance (all three tensors use the TensorNHWC layout)
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,   // presumably the threadblock tile ("128x256_32" in the test name) - confirm
+    cutlass::gemm::GemmShape<64, 64, 32>,     // presumably the warp tile ("64x64x32" in the test name) - confirm
+    cutlass::gemm::GemmShape<16, 8, 16>,      // presumably the tensor-op instruction shape - confirm
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,   // number of ElementC values that fit in 128 bits
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<>,
+    3,                                        // presumably the pipeline stage count (the "x3" in the test name) - confirm
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kStrided
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance (no extra problem sizes are passed here)
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x256_64x3_64x64x64) {
+  // Builds a DefaultConv2dDgrad kernel (kAnalytic iterators, kStrided stride support) with a 128x256x64 first GemmShape.
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm): cutlass::half_t A/B operands, float C/accumulator/compute
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = float;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  /// Device-level Conv2d instance (all three tensors use the TensorNHWC layout)
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 64>,   // presumably the threadblock tile ("128x256_64" in the test name) - confirm
+    cutlass::gemm::GemmShape<64, 64, 64>,     // presumably the warp tile ("64x64x64" in the test name) - confirm
+    cutlass::gemm::GemmShape<16, 8, 16>,      // presumably the tensor-op instruction shape - confirm
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,   // number of ElementC values that fit in 128 bits
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<>,
+    3,                                        // presumably the pipeline stage count (the "x3" in the test name) - confirm
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kAnalytic,
+    cutlass::conv::StrideSupport::kStrided
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance (no extra problem sizes are passed here)
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+}
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
--- a/test/unit/conv/device/conv2d_testbed.h
+++ b/test/unit/conv/device/conv2d_testbed.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -81,7 +81,7 @@ public:
  >;

  using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
-
+  using ReductionStrideIndex = typename ReductionDevice::StrideIndex;

 public:

@ -161,7 +161,7 @@ public:
    initialize_tensor(tensor_A.host_view(), init_A, seed); 
    initialize_tensor(tensor_B.host_view(), init_B, seed * 17); 
    initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
-
+    
    tensor_A.sync_device();
    tensor_B.sync_device();
    tensor_C.sync_device();
@ -214,7 +214,7 @@ public:

 #if 0 //display conv2d problem size for debugging
    std::cout << problem_size << std::endl
-              << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl
+              << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
              << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
              << std::endl;
 #endif
@ -262,7 +262,7 @@ public:
    if (status != cutlass::Status::kSuccess) {
      return false;
    }
-  
+
    // run conv2d operator
    status = conv2d_op();
    
@ -271,6 +271,7 @@ public:
      return false;
    }

+
    if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {

      // configure parallel reduction operator 
@ -280,10 +281,20 @@ public:
        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
        problem_size.split_k_slices,
        cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
-        {reinterpret_cast<ElementAccumulator*> (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C 
+        {
+          reinterpret_cast<ElementAccumulator*> (workspace.get()),
+          ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        {
+          tensor_D_computed.device_data(),
+          ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        {
+          tensor_C.device_data(),
+          ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C 
+        {alpha, beta} 
      );

      status = reduction_op.initialize(reduction_args, nullptr);
@ -302,7 +313,11 @@ public:
      }
    }
    bool passed = false;
-    
+
+    cudaError_t result = cudaDeviceSynchronize();
+    EXPECT_EQ(result, cudaSuccess) << " device reference error: " 
+                                   << cudaGetErrorString(result);
+
    tensor_D_computed.sync_host();

 #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
@ -326,10 +341,6 @@ public:
      alpha, 
      beta);

-    cudaError_t result = cudaDeviceSynchronize();
-    EXPECT_EQ(result, cudaSuccess) << " device reference error: " 
-                                   << cudaGetErrorString(result);
-
    // sync host (copy device data to host) for dumping error output in case of mismatches
    tensor_D_reference.sync_host();
    
@ -445,7 +456,7 @@ bool TestAllConv2d(
  Conv2dProblemVector const *problem_vectors[] = {
    &conv_test_sizes,                               // run user specified sizes
    &conv_problems.conv2d_default_sizes,            // run default and cudnn bug sizes
-    &conv_problems.conv2d_resnet50_sizes,           // run resnet50 sizes
+    //&conv_problems.conv2d_resnet50_sizes,           // run resnet50 sizes
 #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 
    &conv_problems.conv2d_rigorous_sizes,           // run large and rigorous sizes if enabled
 #endif
@ -467,7 +478,7 @@ bool TestAllConv2d(
      // Procedurally disable certain cases
      //
  
-      // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} 
+      // CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1} 
      if ((ImplicitGemm::kConvolutionalOperator == 
            cutlass::conv::Operator::kDgrad) && 
          (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == 
@ -477,6 +488,18 @@ bool TestAllConv2d(
        }
      }

+      // CUTLASS DGRAD's *strided* stride specialization supports all strides {stride_h, stride_w}. 
+      // Although strided dgrad works for all stride combinations, we are only going 
+      // to run strided dgrad for non-unity strides 
+      if ((ImplicitGemm::kConvolutionalOperator == 
+            cutlass::conv::Operator::kDgrad) && 
+          (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == 
+            cutlass::conv::StrideSupport::kStrided)) {
+         if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
+           continue;
+         }
+      }
+      
      //
      // Test
      //
@ -491,7 +514,7 @@ bool TestAllConv2d(
      if (!passed) {
        return false;
      }
-
+      
      // test mode = convolution
      passed = testbed.run(
        conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
@ -503,6 +526,30 @@ bool TestAllConv2d(
    }
  }

+  // CUTLASS DGRAD's *strided* specialization does not support split-k mode 
+  if ((ImplicitGemm::kConvolutionalOperator == 
+          cutlass::conv::Operator::kDgrad) && 
+      (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == 
+        cutlass::conv::StrideSupport::kStrided)) {
+
+    passed = testbed.run(
+      cutlass::conv::Conv2dProblemSize(
+      {1, 56, 56, 8},   // input size (NHWC)
+      {8, 1, 1, 8},     // filter size (KRSC)
+      {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+      {2, 2},           // stride (stride_h, stride_w)
+      {1, 1}),          // dilation (dilation_h, dilation_w)
+      cutlass::conv::SplitKMode::kSerial,
+      cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0), 
+      cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));
+
+    if (!passed) {
+      return false;
+    }
+
+    return passed;
+  }
+
  // Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero beta for 
  // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters 
  // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep 
--- a/test/unit/conv/device/conv2d_testbed_interleaved.h
+++ b/test/unit/conv/device/conv2d_testbed_interleaved.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -82,7 +82,7 @@ public:
  >;

  using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
-
+  using ReductionStrideIndex = typename ReductionDevice::StrideIndex;

 public:

@ -245,10 +245,20 @@ public:
        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
        problem_size.split_k_slices,
        cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
-        {reinterpret_cast<ElementAccumulator*> (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C 
+        {
+          reinterpret_cast<ElementAccumulator*> (workspace.get()),
+          ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        {
+          tensor_D_computed.device_data(),
+          ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        {
+          tensor_C.device_data(),
+          ReductionStrideIndex(tensor_C.stride()[Conv2d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C 
+        {alpha, beta}
      );

      status = reduction_op.initialize(reduction_args, nullptr);
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -36,51 +36,6 @@
 #include "conv2d_testbed.h"


-////////////////////////////////////////////////////////////////////////////////
-TEST(SM50_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x2_32x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm50,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    2,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM50_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
  64x64_8x2_32x32x8) {
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -37,95 +37,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  64x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}

 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
@ -172,96 +83,6 @@ TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_s

 }

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  128x128_8x4_64x32x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<64, 32, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = cutlass::complex<float>;
-  using ElementB           = cutlass::complex<float>;
-  using ElementC           = cutlass::complex<float>;
-  using ElementAccumulator = cutlass::complex<float>;
-  using ElementCompute     = cutlass::complex<float>;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAddComplex,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
  128x128_8x4_64x32x8) {
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -37,151 +37,6 @@

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  64x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  128x128_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<128, 128, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kAnalytic
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  test::conv::device::Conv2dProblemVector user_size;
-
-  user_size.push_back(cutlass::conv::Conv2dProblemSize(
-      {1, 8, 8, 4},   // input size  (NHWC)
-      {8, 1, 1, 4},   // filter size (KRSC)
-      {0, 0, 0, 0},                      // padding (pad_h, _, pad_w, _)
-      {1, 1},                            // stride (stride_h, stride_w)
-      {1, 1}                             // dilation (dilation_h, dilation_w) 
-    ));
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>(user_size));
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
  128x128_8x4_64x32x8) {
@ -227,51 +82,6 @@ TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt

 }

-////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
-  32x64_8x4_32x64x8) {
-
-  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
-  using ElementA           = float;
-  using ElementB           = float;
-  using ElementC           = float;
-  using ElementAccumulator = float;
-  using ElementCompute     = float;
-
-
-  /// Device-level Conv2d instance
-  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
-    ElementA, 
-    cutlass::layout::TensorNHWC,
-    ElementB, 
-    cutlass::layout::TensorNHWC,
-    ElementC, 
-    cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<32, 64, 8>,
-    cutlass::gemm::GemmShape<32, 64, 8>, 
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    cutlass::epilogue::thread::LinearCombination<
-      ElementC,
-      1,
-      ElementAccumulator,
-      ElementCompute
-    >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
-    4,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kOptimized
-  >::Kernel;
-
-  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
-
-  /// Run all unit test sizes with device-level Conv2d instance
-  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
  128x128_8x4_64x32x8) {
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h
+++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h
@ -0,0 +1,551 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implicit GEMM testbed
+*/
+#pragma once
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/reduction/device/reduce_split_k.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+#include "conv2d_problems.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/reference/device/convolution.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/tensor_view_io.h"
+
+namespace test {
+namespace conv {
+namespace device {
+
+template <typename Conv2d>
+class TestbedConv2dWithBroadcast {
+public:
+  // Test harness: runs one Conv2d operator instance and compares its output with a reference convolution
+  using ElementA = typename Conv2d::ElementA;
+  using LayoutA = typename Conv2d::LayoutA;
+  using ElementB = typename Conv2d::ElementB;
+  using LayoutB = typename Conv2d::LayoutB;
+  using ElementC = typename Conv2d::ElementC;
+  using LayoutC = typename Conv2d::LayoutC;
+  using ElementAccumulator = typename Conv2d::ElementAccumulator;
+  using ElementCompute = typename Conv2d::ElementCompute;
+  using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
+  /// Operator kind of the Conv2d under test; selects the tensor extents and is passed to the reference convolution
+  static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
+
+public:
+
+  /// Fill distribution used for each of A, B and C, plus a seed
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;  // set by the constructor; run() calls initialize() without a seed, so initialize()'s default is used instead
+  /// Operand tensors A, B, C and the two outputs that run() compares (computed vs. reference)
+  cutlass::HostTensor<ElementA, LayoutA> tensor_A;
+  cutlass::HostTensor<ElementB, LayoutB> tensor_B;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_C;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
+
+public:
+  /// Stores the fill distributions and the seed; the tensors are sized later by initialize()
+  TestbedConv2dWithBroadcast(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
+
+  }
+
+  /// Helper to initialize a tensor view according to dist_kind; kinds not listed below leave the view untouched
+  template <typename Element, typename Layout>
+  void initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+      // scope shrinks with the element width; scope and -scope are handed to the fill (presumably as the range bounds - confirm)
+      int scope;
+      int bits = cutlass::sizeof_bits<Element>::value;
+
+      if (bits <= 8) {
+        scope = 2;
+      }
+      else if (bits == 16) {
+        scope = 3;
+      }
+      else {
+        scope = 8;
+      }
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope, -scope, 0);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+    } 
+    else {  // any other distribution kind: nothing is written
+    }
+  }
+  /// Resizes all five tensors for problem_size, fills A, B and C on the host, then calls sync_device() on every tensor
+  void initialize(
+    cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
+        
+    tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
+    tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
+    tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
+    tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
+    tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
+    // Each operand gets a different seed derived from the one passed in
+    initialize_tensor(tensor_A.host_view(), init_A, seed); 
+    initialize_tensor(tensor_B.host_view(), init_B, seed * 17); 
+    initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
+    
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_C.sync_device();
+    tensor_D_computed.sync_device();
+    tensor_D_reference.sync_device();
+  }
+  /// Returns false when the kernel's SharedStorage exceeds the device's sharedMemPerMultiprocessor; throws std::runtime_error if a CUDA query fails
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    int smem_size = int(sizeof(typename Conv2d::ImplicitGemmKernel::SharedStorage));
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Executes one test. Returns true when computed and reference D match, and also when the test is waived or the operator's initialize() fails
+  bool run(
+    cutlass::conv::Conv2dProblemSize const &problem_size,
+    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
+    ElementCompute alpha = ElementCompute(1),
+    ElementCompute beta = ElementCompute(0)) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+#if 0 //display conv2d problem size for debugging
+    std::cout << problem_size << std::endl
+              << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
+              << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
+              << std::endl;
+#endif
+
+    initialize(problem_size);  // no seed argument: initialize()'s default seed is used, not the `seed` member
+
+    // configure the operator
+    Conv2d conv2d_op;
+
+    typename Conv2d::Arguments conv2d_args(
+      problem_size,
+      tensor_A.device_ref(),
+      tensor_B.device_ref(),
+      tensor_C.device_ref(),
+      tensor_D_computed.device_ref(),
+      {alpha, beta},
+      split_k_mode
+    );
+
+    // find workspace requirement for parallel split-k reduction
+    size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);
+
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get());
+
+    if (status != cutlass::Status::kSuccess) {
+      cudaError_t error = cudaGetLastError();
+      std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
+      return true;  // an operator that cannot be initialized is reported and counted as passing
+    }
+
+    // conv2d operation with parallel split-k-mode
+    if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
+
+      // conv2d output is written to workspace in global memory
+      conv2d_args.ref_D.reset(reinterpret_cast<ElementC*>(workspace.get()));
+      // accumulate mma for each cta in k-dimension (1.0 * A * B)
+      conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)}; 
+      // update conv2d operator arguments
+      status = conv2d_op.update(conv2d_args, workspace.get());
+    }
+    
+    EXPECT_TRUE(status == cutlass::Status::kSuccess);
+    if (status != cutlass::Status::kSuccess) {
+      return false;
+    }
+
+    // run conv2d operator
+    status = conv2d_op();
+    
+    EXPECT_TRUE(status == cutlass::Status::kSuccess);
+    if (status != cutlass::Status::kSuccess) {
+      return false;
+    }
+    // NOTE(review): in kParallel mode ref_D points at the workspace and no reduction into tensor_D_computed is visible below - confirm this is intended
+    bool passed = false;
+
+    cudaError_t result = cudaDeviceSynchronize();
+    EXPECT_EQ(result, cudaSuccess) << " device reference error: " 
+                                   << cudaGetErrorString(result);
+
+    tensor_D_computed.sync_host();
+
+#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
+    // Reference result computed through the device refs of A, B, C with the same alpha and beta
+    cutlass::reference::device::Conv2d<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ElementCompute,
+      ElementAccumulator 
+    >(
+      kConvolutionalOperator,
+      problem_size,
+      tensor_A.device_ref(),
+      tensor_B.device_ref(),
+      tensor_C.device_ref(),
+      tensor_D_reference.device_ref(),
+      alpha, 
+      beta);
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D_reference.sync_host();
+    
+#else 
+    // Reference result computed through the host refs of A, B, C with the same alpha and beta
+    cutlass::reference::host::Conv2d<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ElementCompute,
+      ElementAccumulator
+    >(
+      kConvolutionalOperator,
+      problem_size,
+      tensor_A.host_ref(),
+      tensor_B.host_ref(),
+      tensor_C.host_ref(),
+      tensor_D_reference.host_ref(),
+      alpha, 
+      beta);
+
+#endif
+    passed = cutlass::reference::host::TensorEquals(
+      tensor_D_computed.host_view(), 
+      tensor_D_reference.host_view());
+
+    EXPECT_TRUE(passed);
+
+    if (!passed) {
+      std::stringstream fname;
+      // On mismatch, write the problem size and all five tensors to a text file named after the configuration
+      fname << "error_Conv2d_ImplicitGemm_device_"
+        << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
+        << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
+            (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) 
+        << "nhwc_"
+        << problem_size.N << "x"
+        << problem_size.H << "x"
+        << problem_size.W << "x"
+        << problem_size.C 
+        << "_krsc_"
+        << problem_size.K << "x"
+        << problem_size.R << "x"
+        << problem_size.S << "x"
+        << problem_size.C 
+        << "_padding_" 
+        << problem_size.pad_h << "x"
+        << problem_size.pad_w 
+        << "_stride_"  
+        << problem_size.stride_h << "x"
+        << problem_size.stride_w 
+        << "_dilation_"
+        << problem_size.dilation_h << "x"
+        << problem_size.dilation_w << "_"
+        << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
+        << Conv2d::ThreadblockShape::kM << "x"  
+        << Conv2d::ThreadblockShape::kN << "x"  
+        << Conv2d::ThreadblockShape::kK << "_"
+        << Conv2d::WarpShape::kM << "x"  
+        << Conv2d::WarpShape::kN << "x"  
+        << Conv2d::WarpShape::kK << ".txt";
+
+      std::cout << fname.str() << std::endl;
+
+      std::ofstream results(fname.str());
+
+      results << problem_size << std::endl;
+
+      results
+        << "\nA:\n" << tensor_A.host_view() << "\n"
+        << "\nB:\n" << tensor_B.host_view() << "\n"
+        << "\nC:\n" << tensor_C.host_view() << "\n"
+        << "\nD reference:\n" << tensor_D_reference.host_view() << "\n"
+        << "\nD computed:\n" << tensor_D_computed.host_view() << "\n";
+
+    }
+
+    return passed;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// TestAllConv2dWithBroadcast: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
+// TestAllConv2dWithBroadcast runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
+// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and a blacklist of sizes 
+// (conv_blacklist_sizes)
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename ImplicitGemm>
+bool TestAllConv2dWithBroadcast(
+  const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(),
+  const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) {
+  // Returns true only if every testbed run passes; returns false at the first run that fails
+  bool passed = true;
+
+  //
+  // Testbed object
+  //
+
+  TestbedConv2dWithBroadcast<ImplicitGemm> testbed;
+
+  //
+  // Get conv problem sizes to run conv operator (constructed with the number of ElementA elements in 128 bits)
+  //
+  TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
+
+  // Vector of conv2d problem sizes to avoid duplicate runs
+  Conv2dProblemVector conv_tested_sizes;
+
+  Conv2dProblemVector const *problem_vectors[] = {
+    &conv_test_sizes,                               // run user specified sizes
+    &conv_problems.conv2d_default_sizes,            // run default and cudnn bug sizes
+    &conv_problems.conv2d_resnet50_sizes,           // run resnet50 sizes
+#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 
+    &conv_problems.conv2d_rigorous_sizes,           // run large and rigorous sizes if enabled
+#endif
+  };
+
+  // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
+  for (Conv2dProblemVector const * problem_vector : problem_vectors) {
+
+    //  Run conv testbed on each problem size of this vector (conv_problem is a copy, so reset_mode below does not alter the vector)
+    for(auto conv_problem : *problem_vector) {
+
+      // Skip blacklist and avoid duplicate problem sizes
+      if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
+          std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
+        continue;
+      }
+
+      //
+      // Procedurally disable certain cases
+      //
+  
+      // CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1}
+      if ((ImplicitGemm::kConvolutionalOperator == 
+            cutlass::conv::Operator::kDgrad) && 
+          (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == 
+            cutlass::conv::StrideSupport::kUnity)) {
+        if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
+          continue;
+        }
+      }
+
+#if 0 // relax restrictions on analytic strided dgrad
+      // CUTLASS DGRAD's *strided* specialization only supports stride >= {2, 2}
+      if ((ImplicitGemm::kConvolutionalOperator == 
+            cutlass::conv::Operator::kDgrad) && 
+          (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == 
+            cutlass::conv::StrideSupport::kStrided)) {
+         if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
+           continue;
+         }
+      }
+#endif
+      
+      //
+      // Test
+      //
+      // push back tested problem size to avoid re-running duplicates
+      conv_tested_sizes.push_back(conv_problem);
+
+      // test mode = xcross
+      passed = testbed.run(
+        conv_problem,
+        cutlass::conv::SplitKMode::kSerial);
+    
+      if (!passed) {
+        return false;
+      }
+      
+      // test mode = convolution
+      passed = testbed.run(
+        conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
+        cutlass::conv::SplitKMode::kSerial);
+    
+      if (!passed) {
+        return false;
+      }
+    }
+  }
+
+  // CUTLASS DGRAD's *strided* specialization does not support split-k mode
+  if ((ImplicitGemm::kConvolutionalOperator == 
+          cutlass::conv::Operator::kDgrad) && 
+      (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == 
+        cutlass::conv::StrideSupport::kStrided)) {
+    // so run one serial case with alpha = beta = 2 and return without the split-k sweep below
+    passed = testbed.run(
+      cutlass::conv::Conv2dProblemSize(
+      {1, 56, 56, 8},   // input size (NHWC)
+      {8, 1, 1, 8},     // filter size (KRSC)
+      {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+      {2, 2},           // stride (stride_h, stride_w)
+      {1, 1}),          // dilation (dilation_h, dilation_w)
+      cutlass::conv::SplitKMode::kSerial,
+      cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0), 
+      cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));
+
+    if (!passed) {
+      return false;
+    }
+
+    return passed;
+  }
+
+  // Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero beta for
+  // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
+  // which are absolutely necessary to catch functional bugs. The below code does provide the option to sweep
+  // alpha and beta for local testing, but only runs one value for alpha and beta.
+  cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
+      {1, 17, 11, 288},   // input size (NHWC)
+      {160, 3, 3, 288},   // filter size (KRSC)
+      {1, 1, 1, 1},       // padding (pad_h, _, pad_w, _)
+      {1, 1},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    );
+
+  cutlass::conv::SplitKMode split_k_modes [] = {
+    cutlass::conv::SplitKMode::kSerial,
+    cutlass::conv::SplitKMode::kParallel,
+  };
+
+  int split_k_slices[] = {
+    1, 2, 3, 4, 201
+  };
+
+  double problem_alpha[] = {
+    2.0
+  };
+
+  double problem_beta[] = {
+    2.0
+  };
+
+  for (auto split_k_mode : split_k_modes) {
+    for (auto split_k_slice : split_k_slices) {
+      for (auto alpha : problem_alpha) {
+        for (auto beta : problem_beta) {
+          // One run per (mode, slice count, alpha, beta) combination; stop at the first failure
+          passed = testbed.run(
+            conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
+            split_k_mode,
+            cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha), 
+            cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));
+
+          if (!passed) {
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  return passed;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace conv
+} // namespace test
--- a/test/unit/conv/device/conv2d_with_reduction_testbed.h
+++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h
@ -0,0 +1,568 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implicit GEMM testbed
+*/
+#pragma once
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/reduction/device/reduce_split_k.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+#include "conv2d_problems.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+
+#include "cutlass/util/reference/host/convolution.h"
+#include "cutlass/util/reference/device/convolution.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/tensor_view_io.h"
+
+namespace test {
+namespace conv {
+namespace device {
+
+/// Testbed that runs a Conv2d implicit GEMM operator which takes two extra device pointers
+/// (tensor_Reduction and tensor_Tensor) in addition to the usual A/B/C/D tensors, and checks
+/// the primary output D against a reference convolution.
+///
+/// Only tensor D is compared against the reference. tensor_Reduction and tensor_Tensor are
+/// allocated and handed to the operator, but their contents are not verified by this class.
+template <typename Conv2d>
+class TestbedConv2dWithReduction {
+public:
+
+  using ElementA = typename Conv2d::ElementA;
+  using LayoutA = typename Conv2d::LayoutA;
+  using ElementB = typename Conv2d::ElementB;
+  using LayoutB = typename Conv2d::LayoutB;
+  using ElementC = typename Conv2d::ElementC;
+  using LayoutC = typename Conv2d::LayoutC;
+  using ElementAccumulator = typename Conv2d::ElementAccumulator;
+  using ElementCompute = typename Conv2d::ElementCompute;
+  using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
+  using ElementT = typename EpilogueOutputOp::ElementTensor;
+
+  static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
+
+public:
+
+  /// Distributions used to fill the A, B and C operands
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+
+  /// Seed supplied at construction.
+  /// NOTE(review): run() calls initialize() without passing this value, so the tensors are
+  /// filled using initialize()'s own default seed instead - confirm that this is intended.
+  uint64_t seed;
+
+  /// Operands of the convolution
+  cutlass::HostTensor<ElementA, LayoutA> tensor_A;
+  cutlass::HostTensor<ElementB, LayoutB> tensor_B;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_C;
+
+  /// Extra outputs handed to the operator (sized in initialize())
+  cutlass::HostTensor<ElementAccumulator, cutlass::layout::RowMajor> tensor_Reduction;
+  cutlass::HostTensor<ElementT,           cutlass::layout::RowMajor> tensor_Tensor;
+
+  /// Output of the operator under test and of the reference implementation
+  cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
+
+public:
+
+  /// Stores the fill distributions and the seed; no tensors are allocated here.
+  TestbedConv2dWithReduction(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
+
+  }
+
+  /// Helper to initialize a tensor view according to the requested distribution.
+  ///
+  /// The `seed` parameter shadows the member of the same name. Distribution kinds other than
+  /// Uniform, Identity, Gaussian and Sequential leave the view untouched.
+  template <typename Element, typename Layout>
+  void initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      // Narrower element types get a smaller value range
+      int scope;
+      int bits = cutlass::sizeof_bits<Element>::value;
+
+      if (bits <= 8) {
+        scope = 2;
+      }
+      else if (bits == 16) {
+        scope = 3;
+      }
+      else {
+        scope = 8;
+      }
+
+      // presumably the trailing arguments are (max, min, bits) - verify against tensor_fill.h
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope, -scope, 0);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
+    } 
+    else {
+      // Unrecognized distribution: intentionally leave the tensor as it is
+    }
+  }
+
+  /// Resizes every tensor for `problem_size`, fills A, B and C on the host and synchronizes
+  /// A, B, C and both D tensors to the device.
+  ///
+  /// The `seed` parameter shadows the member of the same name; B and C use seed * 17 and
+  /// seed * 39 respectively.
+  void initialize(
+    cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
+        
+    tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
+    tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
+    tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
+
+    // Rows: N * P * Q. Columns: K divided by the threadblock tile's N extent, rounded up.
+    tensor_Reduction.resize({
+      (problem_size.N * problem_size.P * problem_size.Q), 
+      (problem_size.K - 1 + Conv2d::ThreadblockShape::kN) / Conv2d::ThreadblockShape::kN
+    });
+
+    // Rows: N * P * Q. Columns: K.
+    tensor_Tensor.resize({(problem_size.N * problem_size.P * problem_size.Q), problem_size.K});
+
+    tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
+    tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
+
+    initialize_tensor(tensor_A.host_view(), init_A, seed); 
+    initialize_tensor(tensor_B.host_view(), init_B, seed * 17); 
+    initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
+    
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_C.sync_device();
+    tensor_D_computed.sync_device();
+    tensor_D_reference.sync_device();
+  }
+
+  /// Returns true when the current CUDA device reports at least as much shared memory per
+  /// multiprocessor as the kernel's SharedStorage occupies.
+  ///
+  /// Throws std::runtime_error if the device index or its properties cannot be queried.
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    // Kept as size_t so the comparison with the device property needs no sign conversion
+    size_t smem_size = sizeof(typename Conv2d::ImplicitGemmKernel::SharedStorage);
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Executes one test.
+  ///
+  /// Returns true when the computed D tensor equals the reference D tensor. Also returns
+  /// true when the test is waived because sufficient() is false, or when the operator's
+  /// initialize() rejects the arguments. Returns false if updating or launching the operator
+  /// fails, if the device reports an error when synchronizing, or if the tensors differ; in
+  /// the last case the operands and both results are dumped to a text file.
+  ///
+  /// NOTE(review): in parallel split-k mode the operator is redirected to write into the
+  /// workspace, and no separate reduction kernel is launched by this function - confirm that
+  /// tensor_D_computed holds the final result in that mode.
+  bool run(
+    cutlass::conv::Conv2dProblemSize const &problem_size,
+    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
+    ElementCompute alpha = ElementCompute(1),
+    ElementCompute beta = ElementCompute(0)) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+#if 0 //display conv2d problem size for debugging
+    std::cout << problem_size << std::endl
+              << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
+              << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
+              << std::endl;
+#endif
+
+    // Uses initialize()'s default seed, not the `seed` member
+    initialize(problem_size);
+
+    // configure the operator
+    Conv2d conv2d_op;
+
+    typename Conv2d::Arguments conv2d_args(
+      problem_size,
+      tensor_A.device_ref(),
+      tensor_B.device_ref(),
+      tensor_C.device_ref(),
+      tensor_D_computed.device_ref(),
+      {alpha, beta},
+      split_k_mode,
+      tensor_Reduction.device_data(),
+      tensor_Tensor.device_data(),
+      static_cast<int>(tensor_Reduction.stride()[0]),
+      static_cast<int>(tensor_Tensor.stride()[0])
+    );
+
+    // find workspace requirement for parallel split-k reduction
+    size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);
+
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get());
+
+    // A rejected configuration is reported but counted as a pass
+    if (status != cutlass::Status::kSuccess) {
+      cudaError_t error = cudaGetLastError();
+      std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
+      return true;
+    }
+
+    // conv2d operation with parallel split-k-mode
+    if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
+
+      // conv2d output is written to workspace in global memory
+      conv2d_args.ref_D.reset(reinterpret_cast<ElementC*>(workspace.get()));
+      // accumulate mma for each cta in k-dimension (1.0 * A * B)
+      conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)}; 
+      // update conv2d operator arguments
+      status = conv2d_op.update(conv2d_args, workspace.get());
+    }
+    
+    EXPECT_TRUE(status == cutlass::Status::kSuccess);
+    if (status != cutlass::Status::kSuccess) {
+      return false;
+    }
+
+    // run conv2d operator
+    status = conv2d_op();
+    
+    EXPECT_TRUE(status == cutlass::Status::kSuccess);
+    if (status != cutlass::Status::kSuccess) {
+      return false;
+    }
+
+    cudaError_t result = cudaDeviceSynchronize();
+    EXPECT_EQ(result, cudaSuccess) << " device reference error: " 
+                                   << cudaGetErrorString(result);
+
+    // EXPECT_EQ is non-fatal, so stop here explicitly: after a device error there is nothing
+    // meaningful to copy back or compare, matching the status checks above.
+    if (result != cudaSuccess) {
+      return false;
+    }
+
+    tensor_D_computed.sync_host();
+
+#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
+
+    cutlass::reference::device::Conv2d<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ElementCompute,
+      ElementAccumulator 
+    >(
+      kConvolutionalOperator,
+      problem_size,
+      tensor_A.device_ref(),
+      tensor_B.device_ref(),
+      tensor_C.device_ref(),
+      tensor_D_reference.device_ref(),
+      alpha, 
+      beta);
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D_reference.sync_host();
+    
+#else 
+
+    cutlass::reference::host::Conv2d<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ElementCompute,
+      ElementAccumulator
+    >(
+      kConvolutionalOperator,
+      problem_size,
+      tensor_A.host_ref(),
+      tensor_B.host_ref(),
+      tensor_C.host_ref(),
+      tensor_D_reference.host_ref(),
+      alpha, 
+      beta);
+
+#endif
+    bool passed = cutlass::reference::host::TensorEquals(
+      tensor_D_computed.host_view(), 
+      tensor_D_reference.host_view());
+
+    EXPECT_TRUE(passed);
+
+    if (!passed) {
+      // Build a file name that encodes the failing configuration
+      std::stringstream fname;
+
+      fname << "error_Conv2d_ImplicitGemm_device_"
+        << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
+        << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
+            (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) 
+        << "nhwc_"
+        << problem_size.N << "x"
+        << problem_size.H << "x"
+        << problem_size.W << "x"
+        << problem_size.C 
+        << "_krsc_"
+        << problem_size.K << "x"
+        << problem_size.R << "x"
+        << problem_size.S << "x"
+        << problem_size.C 
+        << "_padding_" 
+        << problem_size.pad_h << "x"
+        << problem_size.pad_w 
+        << "_stride_"  
+        << problem_size.stride_h << "x"
+        << problem_size.stride_w 
+        << "_dilation_"
+        << problem_size.dilation_h << "x"
+        << problem_size.dilation_w << "_"
+        << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
+        << Conv2d::ThreadblockShape::kM << "x"  
+        << Conv2d::ThreadblockShape::kN << "x"  
+        << Conv2d::ThreadblockShape::kK << "_"
+        << Conv2d::WarpShape::kM << "x"  
+        << Conv2d::WarpShape::kN << "x"  
+        << Conv2d::WarpShape::kK << ".txt";
+
+      std::cout << fname.str() << std::endl;
+
+      // Dump the problem description, the operands and both results
+      std::ofstream results(fname.str());
+
+      results << problem_size << std::endl;
+
+      results
+        << "\nA:\n" << tensor_A.host_view() << "\n"
+        << "\nB:\n" << tensor_B.host_view() << "\n"
+        << "\nC:\n" << tensor_C.host_view() << "\n"
+        << "\nD reference:\n" << tensor_D_reference.host_view() << "\n"
+        << "\nD computed:\n" << tensor_D_computed.host_view() << "\n";
+
+    }
+
+    return passed;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// TestAllConv2dWithReduction: runs the ImplicitGemm convolution operator through
+// TestbedConv2dWithReduction and compares each result with the reference.
+//
+// The operator is run on the default conv problem sizes provided by
+// test::conv::device::TestbedConv2dProblemSizes. Additionally, each conv2d test can provide
+// its own problem sizes (conv_test_sizes) and a list of sizes to skip (conv_blacklist_sizes).
+//
+// Returns false as soon as any single run fails, true otherwise.
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename ImplicitGemm>
+bool TestAllConv2dWithReduction(
+  const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(),
+  const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) {
+
+  using ElementCompute = typename ImplicitGemm::ElementCompute;
+
+  //
+  // Compile-time properties of the operator under test
+  //
+
+  bool const is_dgrad =
+    (ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad);
+
+  bool const is_unity_stride_dgrad = is_dgrad &&
+    (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
+      cutlass::conv::StrideSupport::kUnity);
+
+  bool const is_strided_dgrad = is_dgrad &&
+    (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
+      cutlass::conv::StrideSupport::kStrided);
+
+  //
+  // Testbed object
+  //
+
+  TestbedConv2dWithReduction<ImplicitGemm> testbed;
+
+  //
+  // Problem sizes to run the conv operator on
+  //
+  TestbedConv2dProblemSizes default_problems(
+    128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
+
+  // Sizes that have already been run, kept to avoid duplicate runs
+  Conv2dProblemVector already_run;
+
+  Conv2dProblemVector const *size_lists[] = {
+    &conv_test_sizes,                                  // user specified sizes
+    &default_problems.conv2d_default_sizes,            // default and cudnn bug sizes
+    &default_problems.conv2d_resnet50_sizes,           // resnet50 sizes
+#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 
+    &default_problems.conv2d_rigorous_sizes,           // large and rigorous sizes if enabled
+#endif
+  };
+
+  // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
+  for (Conv2dProblemVector const * size_list : size_lists) {
+
+    // Each problem is taken by value so its mode can be changed below
+    for (auto problem : *size_list) {
+
+      // Skip sizes the caller asked to exclude
+      if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), problem) !=
+            conv_blacklist_sizes.end()) {
+        continue;
+      }
+
+      // Skip sizes that have been run before
+      if (std::find(already_run.begin(), already_run.end(), problem) != already_run.end()) {
+        continue;
+      }
+
+      //
+      // Procedurally disable certain cases
+      //
+
+      // CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1}
+      if (is_unity_stride_dgrad && ((problem.stride_h != 1) || (problem.stride_w != 1))) {
+        continue;
+      }
+
+#if 0 // relax restrictions on analytic strided dgrad
+      // CUTLASS DGRAD's *strided* specialization only supports stride >= {2, 2}
+      if (is_strided_dgrad && (problem.stride_h == 1) && (problem.stride_w == 1)) {
+        continue;
+      }
+#endif
+
+      //
+      // Test
+      //
+
+      // Record the size before running it so duplicates are not re-run
+      already_run.push_back(problem);
+
+      // Cross-correlation: the problem is passed to the testbed as it was listed
+      if (!testbed.run(problem, cutlass::conv::SplitKMode::kSerial)) {
+        return false;
+      }
+
+      // Convolution: same problem with the mode switched
+      if (!testbed.run(
+            problem.reset_mode(cutlass::conv::Mode::kConvolution),
+            cutlass::conv::SplitKMode::kSerial)) {
+        return false;
+      }
+    }
+  }
+
+  // CUTLASS DGRAD's *strided* specialization does not support split-k mode, so a single
+  // strided problem is run with non-unity alpha and non-zero beta instead of the sweep below.
+  if (is_strided_dgrad) {
+
+    cutlass::conv::Conv2dProblemSize strided_dgrad_size (
+        {1, 56, 56, 8},   // input size (NHWC)
+        {8, 1, 1, 8},     // filter size (KRSC)
+        {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+        {2, 2},           // stride (stride_h, stride_w)
+        {1, 1}            // dilation (dilation_h, dilation_w)
+      );
+
+    return testbed.run(
+      strided_dgrad_size,
+      cutlass::conv::SplitKMode::kSerial,
+      cutlass::from_real<ElementCompute>(2.0), 
+      cutlass::from_real<ElementCompute>(2.0));
+  }
+
+  // Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero
+  // beta for a single conv2d problem size. Convolution unit tests take a long time to run, so
+  // only the parameters which are absolutely necessary to catch functional bugs are swept.
+  // The arrays below allow sweeping alpha and beta for local testing, but hold one value each.
+  cutlass::conv::Conv2dProblemSize split_k_problem (
+      {1, 17, 11, 288},   // input size (NHWC)
+      {160, 3, 3, 288},   // filter size (KRSC)
+      {1, 1, 1, 1},       // padding (pad_h, _, pad_w, _)
+      {1, 1},             // stride (stride_h, stride_w)
+      {1, 1}              // dilation (dilation_h, dilation_w)
+    );
+
+  cutlass::conv::SplitKMode const modes[] = {
+    cutlass::conv::SplitKMode::kSerial,
+    cutlass::conv::SplitKMode::kParallel,
+  };
+
+  int const slice_counts[] = {
+    1, 2, 3, 4, 201
+  };
+
+  double const alphas[] = {
+    2.0
+  };
+
+  double const betas[] = {
+    2.0
+  };
+
+  for (cutlass::conv::SplitKMode mode : modes) {
+    for (int slices : slice_counts) {
+      for (double alpha : alphas) {
+        for (double beta : betas) {
+
+          bool const ok = testbed.run(
+            split_k_problem.reset_split_k_slices(slices),
+            mode,
+            cutlass::from_real<ElementCompute>(alpha), 
+            cutlass::from_real<ElementCompute>(beta));
+
+          if (!ok) {
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  // Every run above returned true, otherwise the function would have exited early
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace conv
+} // namespace test
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_problems.h
+++ b/test/unit/conv/device/conv3d_problems.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_testbed.h
+++ b/test/unit/conv/device/conv3d_testbed.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -81,7 +81,8 @@ public:
  >;

  using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
-
+  using ReductionStrideIndex = typename ReductionDevice::StrideIndex;
+  
 public:

  /// Initialization
@ -281,10 +282,20 @@ public:
        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
        problem_size.split_k_slices,
        cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
-        {reinterpret_cast<ElementAccumulator*> (workspace.get()), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {tensor_D_computed.device_data(), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {tensor_C.device_data(), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)},
-        {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C 
+        {
+          reinterpret_cast<ElementAccumulator*> (workspace.get()),
+          ReductionStrideIndex(tensor_C.stride()[Conv3d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        {
+          tensor_D_computed.device_data(),
+          ReductionStrideIndex(tensor_C.stride()[Conv3d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        {
+          tensor_C.device_data(),
+          ReductionStrideIndex(tensor_C.stride()[Conv3d::ImplicitGemmKernel::kTensorCStrideIdx])
+        },
+        // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C 
+        {alpha, beta}
      );

      status = reduction_op.initialize(reduction_args, nullptr);
@ -304,6 +315,38 @@ public:
    }
    bool passed = false;

+    cudaError_t result = cudaDeviceSynchronize();
+    EXPECT_EQ(result, cudaSuccess) << " device reference error: " 
+                                   << cudaGetErrorString(result);
+
+    tensor_D_computed.sync_host();
+
+#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
+    
+    cutlass::reference::device::Conv3d<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ElementAccumulator,
+      ElementCompute
+    >(
+      kConvolutionalOperator,
+      problem_size,
+      tensor_A.device_ref(),
+      tensor_B.device_ref(),
+      tensor_C.device_ref(),
+      tensor_D_reference.device_ref(),
+      alpha, 
+      beta
+    );
+
+    // sync host (copy device data to host) for dumping error output in case of mismatches
+    tensor_D_reference.sync_host();
+    
+#else
    cutlass::reference::host::Conv3d<
      ElementA,
      LayoutA,
@ -323,8 +366,7 @@ public:
      alpha,
      beta
    );
-
-    tensor_D_computed.sync_host();
+#endif

    passed = cutlass::reference::host::TensorEquals(
      tensor_D_computed.host_view(), 
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/CMakeLists.txt
+++ b/test/unit/core/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 cutlass_test_unit_add_executable(
--- a/test/unit/core/array.cu
+++ b/test/unit/core/array.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/bfloat16.cu
+++ b/test/unit/core/bfloat16.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/complex.cu
+++ b/test/unit/core/complex.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -32,6 +32,7 @@
 #include "../common/cutlass_unit_test.h"

 #include "cutlass/complex.h"
+#include "cutlass/constants.h"
 #include "cutlass/numeric_conversion.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////
@ -85,6 +86,42 @@ TEST(complex, f16_to_f32_conversion) {

 ////////////////////////////////////////////////////////////////////////////////////////////////////

+TEST(complex, exp_f32) {
+  // Compares cutlass::exp() of complex<float> inputs with double-precision reference values.
+  cutlass::complex<float> Z[] = {
+    {1, 1},
+    {2   ,  cutlass::constants::pi<float>()/2.0f   },
+    {0.5f,  cutlass::constants::pi<float>()        },
+    {0.25f,  cutlass::constants::pi<float>()*3/4.0f },
+    {0, 0},
+  };
+  // Expected[i] holds the reference result for exp(Z[i]).
+  cutlass::complex<double> Expected[] = {
+    {1.4686939399158851, 2.2873552871788423}, 
+    {4.524491950137825e-16, 7.38905609893065},
+    {-1.6487212707001282, 2.019101226849069e-16}, 
+    {-0.9079430793557842, 0.9079430793557843},
+    {1, 0}
+  };
+  int const kCount = int(sizeof(Z) / sizeof(Z[0]));
+  double tolerance = 0.00001;
+  // Bound the loop by the element count so the final {0, 0} input is exercised too.
+  for (int i = 0; i < kCount; ++i) {
+    double e_r = cutlass::real(Expected[i]);
+    double e_i = cutlass::imag(Expected[i]);
+
+    cutlass::complex<float> got = cutlass::exp(Z[i]);
+    float g_r = cutlass::real(got);
+    float g_i = cutlass::imag(got);
+
+    EXPECT_TRUE(
+      std::abs(g_r - e_r) < tolerance && std::abs(g_i - e_i) < tolerance
+    ) << "Expected(" << Expected[i] << "), Got(" << got << ")";
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 namespace test {

  /// Thorough testing for basic complex math operators. Uses std::complex as a reference.
--- a/test/unit/core/functional.cu
+++ b/test/unit/core/functional.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -29,6 +29,7 @@
 #include "../common/cutlass_unit_test.h"

 #include "cutlass/functional.h"
+#include "cutlass/core_io.h"

 #include "cutlass/layout/matrix.h"
 #include "cutlass/util/host_tensor.h"
@ -78,16 +79,16 @@ __global__ void trinary_operator(

  Operator op;

-  Element a_x = *a;
-  Element b_x = *b;
-  Element c_x = *c;
+  Element a_x = a[blockIdx.x];
+  Element b_x = b[blockIdx.x];
+  Element c_x = c[blockIdx.x];

  CUTLASS_PRAGMA_NO_UNROLL
  for (int i = 0; i < Iterations; ++i) {
    c_x = op(a_x, b_x, c_x);
  }
  
-  *d = c_x;
+  d[blockIdx.x] = c_x;
 }

 /////////////////////////////////////////////////////////////////////////////////////////////////
@ -421,3 +422,67 @@ TEST(Functional, multiply_add_bf16x17) {

 /////////////////////////////////////////////////////////////////////////////////////////////////

+template <typename T>
+cutlass::Quaternion<T> random_quaternion(int range) {
+  // NOTE(review): '%' binds before '*', so each draw is ((rand() % range) * 2) - range.
+  T c0 = T(((rand() % range) * 2) - range);
+  T c1 = T(((rand() % range) * 2) - range);
+  T c2 = T(((rand() % range) * 2) - range);
+  T c3 = T(((rand() % range) * 2) - range);
+  return cutlass::Quaternion<T>{c0, c1, c2, c3};
+}
+
+template <typename T>
+void Functional_multiply_add_QuaternionT() {
+  // Checks the device-side multiply_add functor on Quaternion<T> against host-computed a * b + c.
+  using Element = cutlass::Quaternion<T>;
+  using Operator = cutlass::multiply_add<Element, Element, Element>;
+  using Tensor = cutlass::HostTensor<Element, cutlass::layout::RowMajor>;
+
+  int const kRange = 8;
+  int const kM = 128;
+
+  // One operand per row; each tensor has extent kM-by-1.
+  Tensor A({kM, 1});
+  Tensor B({kM, 1});
+  Tensor C({kM, 1});
+  Tensor D({kM, 1});
+
+  // Fixed seed keeps the generated operands reproducible.
+  srand(2021);
+  for (int row = 0; row < kM; ++row) {
+    A.at({row, 0}) = random_quaternion<T>(kRange);
+    B.at({row, 0}) = random_quaternion<T>(kRange);
+    C.at({row, 0}) = random_quaternion<T>(kRange);
+  }
+
+  A.sync_device();
+  B.sync_device();
+  C.sync_device();
+  D.sync_device();
+
+  // Launch kM blocks of one thread each, i.e. one block per row.
+  dim3 const grid(kM, 1);
+  dim3 const block(1, 1);
+  test::core::kernel::trinary_operator<Element, Operator><<< grid, block >>>(
+    D.device_data(), A.device_data(), B.device_data(), C.device_data());
+
+  D.sync_host();
+
+  // Recompute each result on the host and require an exact match.
+  for (int row = 0; row < kM; ++row) {
+    Element lhs = A.at({row, 0});
+    Element rhs = B.at({row, 0});
+    Element addend = C.at({row, 0});
+    Element got = D.at({row, 0});
+    Element expected = lhs * rhs + addend;
+
+    EXPECT_TRUE(got == expected);
+  }
+}
+
+TEST(Functional, multiply_add_quaternion_f32) {
+  Functional_multiply_add_QuaternionT<float>();  // run the shared test body with T = float
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/core/half.cu
+++ b/test/unit/core/half.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/matrix.cu
+++ b/test/unit/core/matrix.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -32,6 +32,7 @@
 #include "../common/cutlass_unit_test.h"

 #include "cutlass/matrix.h"
+#include "cutlass/core_io.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////

--- a/test/unit/core/matrix_coord.cu
+++ b/test/unit/core/matrix_coord.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/numeric_conversion.cu
+++ b/test/unit/core/numeric_conversion.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/predicate_vector.cu
+++ b/test/unit/core/predicate_vector.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/quaternion.cu
+++ b/test/unit/core/quaternion.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/tensor_ref.cu
+++ b/test/unit/core/tensor_ref.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/tensor_view.cu
+++ b/test/unit/core/tensor_view.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/test_unit_core.cpp
+++ b/test/unit/core/test_unit_core.cpp
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/core/tfloat32.cu
+++ b/test/unit/core/tfloat32.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/CMakeLists.txt
+++ b/test/unit/epilogue/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 add_subdirectory(thread)
--- a/test/unit/epilogue/thread/CMakeLists.txt
+++ b/test/unit/epilogue/thread/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 cutlass_test_unit_add_executable(
--- a/test/unit/epilogue/thread/linear_combination.cu
+++ b/test/unit/epilogue/thread/linear_combination.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -29,6 +29,8 @@
 #include "../../common/cutlass_unit_test.h"

 #include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/activation.h"

 /////////////////////////////////////////////////////////////////////////////////////////////////

@ -119,3 +121,41 @@ TEST(Epilogue_thread_linear_combination, device_side_f16_f32_ptr) {
 }

 /////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+TEST(Epilogue_thread_linear_combination_gelu, device_side_f16_f16_ptr) {
+  // With alpha = 1 and beta = 0, the test expects the output to equal GELU of each input element.
+  using Element = cutlass::half_t;
+  using ElementOutput = cutlass::half_t;
+  int const kCount = 8;
+
+  using LinearCombinationGELU =
+    cutlass::epilogue::thread::LinearCombinationGELU<ElementOutput, kCount, Element, Element>;
+  using Fragment = cutlass::Array<Element, kCount>;
+  using OutputFragment = cutlass::Array<ElementOutput, kCount>;
+
+  Element alpha = Element(1);
+  Element beta = Element(0);
+
+  // The scalars are handed to Params by address.
+  typename LinearCombinationGELU::Params params(&alpha, &beta);
+  LinearCombinationGELU linear_combination_op(params);
+
+  // Inputs are i * 0.3f for i in [0, kCount), converted to half_t.
+  Fragment accum;
+  for (int idx = 0; idx < kCount; ++idx) {
+    accum[idx] = Element(float(idx) * 0.3f);
+  }
+
+  // The same fragment is passed for both operands of the functor call.
+  OutputFragment destination = linear_combination_op(accum, accum);
+  cutlass::epilogue::thread::GELU<ElementOutput> gelu_func;
+
+  for (int idx = 0; idx < kCount; ++idx) {
+    ElementOutput got = destination[idx];
+    ElementOutput expected = gelu_func(accum[idx]);
+    EXPECT_TRUE(expected == got);
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu
+++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/CMakeLists.txt
+++ b/test/unit/epilogue/threadblock/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 cutlass_test_unit_add_executable(
@ -32,4 +32,5 @@ cutlass_test_unit_add_executable(
  epilogue_volta_tensor_op.cu
  epilogue_wmma_tensor_op_sm70.cu
  epilogue_planar_complex.cu
+  epilogue_with_reduction_tensor_op.cu
 )
--- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
+++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/epilogue_simt.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -32,6 +32,7 @@

 #include "cutlass/aligned_buffer.h"
 #include "cutlass/complex.h"
+#include "cutlass/quaternion.h"

 #include "cutlass/gemm/warp/mma_simt.h"
 #include "cutlass/gemm/warp/mma_simt_policy.h"
@ -1088,4 +1089,80 @@ TEST(SM50_Epilogue_threadblock_epilogue, simt_complex_f64_128x128_32x64x8) {
  EXPECT_TRUE(passed);
 }

-///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Quaternion-valued single-precision
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_Epilogue_threadblock_epilogue, simt_quaternion_f32_32x64_32x64x8) {
+  // Runs EpilogueTestbed on a SIMT epilogue whose element type is cutlass::Quaternion<float>.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using Element = cutlass::Quaternion<float>;
+  using ElementOutput = Element;
+  using ElementAccumulator = Element;
+  using ElementCompute = Element;
+  int const kElementsPerAccess = 1;
+
+  // Shape and WarpShape are identical, matching the 32x64_32x64x8 suffix of the test name.
+  using Shape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  //
+  // Warp-level MMA: Element is paired with each of LayoutA, LayoutB, and LayoutC
+  //
+
+  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    Element,
+    LayoutC,
+    cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<4, 8>,
+      cutlass::layout::RowMajorInterleaved<2>,
+      cutlass::gemm::GemmShape<2, 2, 1>
+    >
+  >;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombination<
+    ElementOutput,
+    kElementsPerAccess,
+    ElementAccumulator,
+    ElementCompute
+  >;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+    Shape,
+    WarpMmaSimt,
+    OutputOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate epilogue
+  //
+
+  EpilogueTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
--- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
@ -0,0 +1,875 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+
+  \brief Unit tests for threadblock-level epilogue with reduction
+*/
+
+#include <fstream>
+
+#include "../../common/cutlass_unit_test.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/half.h"
+
+#include "cutlass/epilogue/thread/linear_combination_drelu.h"
+#include "cutlass/gemm/warp/default_mma_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+#include "epilogue_with_reduction_testbed.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// ENABLE_BLOCKED_TESTS is 0 when __CUDACC_VER_MAJOR__ == 11 and __CUDACC_VER_MINOR__ == 1
+// (CUDA 11.1) and 1 otherwise. Tests wrapped in `#if ENABLE_BLOCKED_TESTS` are not compiled on 11.1.
+//
+#define ENABLE_BLOCKED_TESTS (!(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1))
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f16_tensor_op_64x64_64x64x8) {
+  // Checks the epilogue with reduction for cutlass::half_t output, Shape 64x64x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f32_tensor_op_64x64_64x64x8) {
+  // Checks the epilogue with reduction for float output, Shape 64x64x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f32_tensor_op_128x128_64x64x8) {
+  // Checks the epilogue with reduction for float output, Shape 128x128x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f16_tensor_op_128x128_64x64x8) {
+  // Checks the epilogue with reduction for cutlass::half_t output, Shape 128x128x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f32_tensor_op_128x64_64x32x8) {
+  // Checks the epilogue with reduction for float output, Shape 128x64x8, WarpShape 64x32x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if ENABLE_BLOCKED_TESTS
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f16_tensor_op_128x64_64x32x8) {
+  // Checks the epilogue with reduction for cutlass::half_t output, Shape 128x64x8, WarpShape 64x32x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f32_tensor_op_64x128_32x64x8) {
+  // Checks the epilogue with reduction for float output, Shape 64x128x8, WarpShape 32x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f16_tensor_op_64x128_32x64x8) {
+  // Checks the epilogue with reduction for cutlass::half_t output, Shape 64x128x8, WarpShape 32x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f32_tensor_op_128x256_64x64x8) {
+  // Checks the epilogue with reduction for float output, Shape 128x256x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f16_tensor_op_128x256_64x64x8) {
+  // Checks the epilogue with reduction for cutlass::half_t output, Shape 128x256x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f32_tensor_op_256x128_64x64x8) {
+  // Checks the epilogue with reduction for float output, Shape 256x128x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Epilogue_with_reduction_threadblock, f16_tensor_op_256x128_64x64x8) {
+  // Checks the epilogue with reduction for cutlass::half_t output, Shape 256x128x8, WarpShape 64x64x8.
+  //
+  // Define the warp-level matrix multiply
+  //
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+  // Elements per access: 128 bits divided by the bit width of ElementOutput
+  int const kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kPartitionsK = 1;
+
+  using Shape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Element = cutlass::half_t;
+  using ElementC = ElementAccumulator;
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      cutlass::sizeof_bits<Element>::value, 64>;
+  using LayoutC = cutlass::layout::RowMajor;
+
+  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+      LayoutC>::Type;
+
+  //
+  // Output operator
+  //
+
+  using OutputOp = cutlass::epilogue::thread::LinearCombinationDRelu<
+    ElementAccumulator,
+    ElementAccumulator,
+    ElementOutput,
+    ElementOutput,
+    kElementsPerAccess
+  >;
+
+  using ReductionOp = cutlass::plus<ElementAccumulator>;
+
+  //
+  // Define the epilogue
+  //
+
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    kElementsPerAccess
+  >::Epilogue;
+
+  //
+  // Instantiate the testbed for this epilogue and run it
+  //
+
+  EpilogueWithReductionTestbed<Epilogue> testbed;
+
+  bool passed = testbed.run_all();
+
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
+++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
@ -0,0 +1,429 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  
+    \brief Unit tests for epilogues
+*/
+#pragma once
+
+#include <fstream>
+
+#include "../../common/cutlass_unit_test.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/half.h"
+#include "cutlass/complex.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace test {
+namespace kernel {
+
+/// Kernel used by the epilogue-with-reduction unit tests.
+///
+/// Each thread builds tile iterators for D, C and the additional tensor over `problem_size`,
+/// loads its warp's accumulator fragment from `accumulator_ref`, and then invokes the epilogue
+/// `epilogue_count` times with the same arguments.
+///
+///   ptr_Reduction                pointer forwarded to the epilogue as its reduction argument
+///   params_D / ptr_D             parameters and pointer for the D tile iterator
+///   params_C / ptr_C             parameters and pointer for the C tile iterator
+///   params_Tensor / ptr_Tensor   parameters and pointer for the additional tensor tile iterator
+///   params_output_op             parameters used to construct Epilogue::OutputOp
+///   problem_size                 extent passed to all three tile iterators
+///   accumulator_ref              tensor the accumulators are loaded from, offset per warp
+///   epilogue_count               number of times the epilogue is invoked
+///
+template <typename Epilogue>
+__global__ void epilogue_with_reduction_threadblock(
+  typename Epilogue::ElementVector *ptr_Reduction,
+  typename Epilogue::OutputTileIterator::Params params_D,
+  typename Epilogue::OutputTileIterator::Element *ptr_D,
+  typename Epilogue::OutputTileIterator::Params params_C,
+  typename Epilogue::OutputTileIterator::Element *ptr_C,
+  typename Epilogue::TensorTileIterator::Params params_Tensor,
+  typename Epilogue::TensorTileIterator::Element *ptr_Tensor,
+  typename Epilogue::OutputOp::Params params_output_op,
+  cutlass::MatrixCoord problem_size,
+  cutlass::TensorRef<
+    typename Epilogue::WarpMmaOperator::ElementC, 
+    typename Epilogue::WarpMmaOperator::LayoutC> accumulator_ref,
+  int epilogue_count = 1) {
+
+  using WarpMma = typename Epilogue::WarpMmaOperator;
+  using WarpCount = typename Epilogue::WarpCount;
+
+  __shared__ typename Epilogue::SharedStorage shared_storage;
+
+  // Thread index within the block, and its warp / lane split assuming 32 threads per warp
+  int const thread_idx = threadIdx.x;
+  int const warp_idx = thread_idx / 32;
+  int const lane_idx = thread_idx % 32;
+
+  //
+  // Construct the tile iterators and the epilogue
+  //
+
+  // Tile iterator for the output (D) tile
+  typename Epilogue::OutputTileIterator iterator_D(
+    params_D, ptr_D, problem_size, thread_idx);
+
+  // Tile iterator for the C operand tile
+  typename Epilogue::OutputTileIterator iterator_C(
+    params_C, ptr_C, problem_size, thread_idx);
+
+  // Tile iterator for the additional tensor operand tile
+  typename Epilogue::TensorTileIterator iterator_T(
+    params_Tensor, ptr_Tensor, problem_size, thread_idx);
+
+  Epilogue epilogue(
+    shared_storage, thread_idx, warp_idx, lane_idx);
+
+  //
+  // Load this warp's accumulators
+  //
+
+  // Decompose the warp index into (row, column) with the M index varying fastest
+  int const warp_linear = warp_idx % (WarpCount::kM * WarpCount::kN);
+  int const warp_row = warp_linear % WarpCount::kM;
+  int const warp_col = warp_linear / WarpCount::kM;
+
+  accumulator_ref.add_coord_offset({
+    warp_row * WarpMma::Shape::kM,
+    warp_col * WarpMma::Shape::kN});
+
+  typename WarpMma::IteratorC accumulator_iterator(accumulator_ref, lane_idx);
+
+  typename Epilogue::AccumulatorTile accumulators;
+  accumulators.clear();
+  accumulator_iterator.load(accumulators);
+
+#if 0
+  // For debugging, enable this block of code to fill each accumulator element with its
+  // source thread ID.
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < accumulators.size(); ++i) {
+    typename Epilogue::WarpMmaOperator::ElementC x(threadIdx.x);
+    //typename Epilogue::WarpMmaOperator::ElementC x(i);
+    accumulators[i] = x;
+  }
+
+  /*
+  #pragma unroll 1
+  for (int tid = 0; tid < 32; ++tid) {
+    if (tid == thread_idx) {
+      printf("\nT%d: ", thread_idx);
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < accumulators.size(); ++i) {
+        printf("%d ", int(accumulators[i]));
+      }  
+    }
+  }
+
+  if (thread_idx == 0) {
+    printf("\n\n");  
+  }
+  */
+
+  __syncthreads();
+
+#endif
+
+  //
+  // Perform the epilogue operation
+  //
+
+  typename Epilogue::OutputOp output_op(params_output_op);
+
+  // Invoke the epilogue epilogue_count times
+  for (int pass = 0; pass < epilogue_count; ++pass) {
+    epilogue(output_op, ptr_Reduction, iterator_D, accumulators, iterator_C, iterator_T);
+  }
+}
+
+} // namespace kernel
+} // namespace test
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Host-side testbed for a threadblock-scoped epilogue that also produces a reduction.
+///
+/// Launches test::kernel::epilogue_with_reduction_threadblock<Epilogue> on a single threadblock
+/// and checks on the host (a) the output tile against alpha * accumulator + beta * source and
+/// (b) the 1-by-N reduction tensor against the per-column sums of the accumulator tile.
+template <
+  typename Epilogue_
+>
+class EpilogueWithReductionTestbed {
+public:
+
+  using Epilogue = Epilogue_;
+  using ElementAccumulator = typename Epilogue::ElementAccumulator;
+  using ElementCompute = typename Epilogue::OutputOp::ElementCompute;
+  using ElementTensor = typename Epilogue::TensorTileIterator::Element;
+  using ElementOutput = typename Epilogue::ElementOutput;
+  using OutputOpParams = typename Epilogue::OutputOp::Params;
+
+public:
+
+  //
+  // Data members
+  //
+
+  /// Extent of the full tile: (Epilogue::Shape::kM, Epilogue::Shape::kN)
+  cutlass::MatrixCoord quantized_size;
+
+  /// Accumulator tile handed to the kernel; also the input of the host reference
+  cutlass::HostTensor<ElementAccumulator, cutlass::layout::RowMajor> accumulator_tensor;
+
+  /// Source tile, scaled by beta in the host reference
+  cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> source_tensor;
+
+  /// Tile written by the kernel and compared element by element
+  cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> output_tensor;
+
+  /// Tile passed to the kernel through Epilogue::TensorTileIterator; filled with ones
+  cutlass::HostTensor<ElementTensor, cutlass::layout::RowMajor> additional_tensor;
+
+  /// 1-by-kN tensor receiving the reduction
+  cutlass::HostTensor<ElementAccumulator, cutlass::layout::RowMajor> reduction_tensor;
+
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Allocates all tensors for the full tile and fills the inputs.
+  EpilogueWithReductionTestbed(): 
+    quantized_size(Epilogue::Shape::kM, Epilogue::Shape::kN),
+    accumulator_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
+    source_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
+    output_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
+    additional_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
+    reduction_tensor({1, Epilogue::Shape::kN}) {
+
+    //
+    // Initialize problem space
+    //
+
+    uint64_t seed = 2019;
+
+    // NOTE(review): the trailing arguments (20, -20, 0) are presumably max, min and the number of
+    // fractional bits kept - confirm against TensorFillRandomUniform.
+    cutlass::reference::host::TensorFillRandomUniform(
+      accumulator_tensor.host_view(), 
+      seed, 
+      20, 
+      -20, 
+      0);
+
+    // A different seed is used so that source and accumulators are not filled identically.
+    cutlass::reference::host::TensorFillRandomUniform(
+      source_tensor.host_view(),
+      seed + 2018, 
+      20, 
+      -20, 
+      0);
+
+    cutlass::reference::host::TensorFill(additional_tensor.host_view(), ElementTensor(1));
+  }
+
+  /// Runs the only configuration currently enabled: the full tile with alpha = 1 and beta = 0.
+  /// The sweep over several sizes and scalars below is kept, but commented out.
+  bool run_all() {
+   
+    /*
+    double alpha_values[] = {1, 0, 2.25};
+    double beta_values[] = {0, 1, -1.25};
+
+    // Test runtime explodes if we tried to test every case exhaustively. This tests the full
+    // output tile and several smaller sizes to stress predication.
+    for (int m_idx = 0; m_idx < 3; ++m_idx) {
+      for (int n_idx = 0; n_idx < 3; ++n_idx) {
+
+        int m = quantized_size.row() - m_idx * 3;
+        int n = quantized_size.column() - n_idx * Epilogue::kElementsPerAccess;
+
+        for (double const &alpha : alpha_values) {
+          for (double const &beta : beta_values) {
+
+            bool passed = run({m, n}, {cutlass::from_real<ElementCompute>(alpha), cutlass::from_real<ElementCompute>(beta)});
+
+            if (!passed) {
+              return false;
+            }
+          }
+        }
+      }
+    }
+    return true;
+    */
+
+    double alpha = 1;
+    double beta = 0;
+
+    return run(
+      {quantized_size.row(), quantized_size.column()},
+      {cutlass::from_real<ElementCompute>(alpha), cutlass::from_real<ElementCompute>(beta)});
+  }
+
+  /// Runs one epilogue launch and verifies its results on the host.
+  ///
+  /// problem_size   - extent inside the tile that is expected to be written; output elements
+  ///                  outside it are expected to keep their fill value
+  /// output_params  - output operator parameters; alpha and beta are read for the reference
+  ///
+  /// Returns true only if no CUDA error was reported and both the output tile and the reduction
+  /// vector match the host reference.
+  bool run(
+    cutlass::MatrixCoord problem_size,
+    OutputOpParams output_params) { 
+
+    //
+    // Initialize problem space
+    //
+
+    // Sentinel values: anything the kernel leaves untouched must still hold these afterwards.
+    ElementOutput default_output = ElementOutput(-127);
+    ElementAccumulator default_reduction = ElementAccumulator();
+
+    cutlass::reference::host::TensorFill(output_tensor.host_view(), default_output);
+    cutlass::reference::host::TensorFill(reduction_tensor.host_view(), default_reduction);
+
+    accumulator_tensor.sync_device();
+    output_tensor.sync_device();
+    source_tensor.sync_device();
+    additional_tensor.sync_device();
+    reduction_tensor.sync_device();
+
+    //
+    // Initialize epilogue parameters
+    //
+
+    typename Epilogue::OutputTileIterator::Params params_D(output_tensor.device_ref().layout());
+    typename Epilogue::OutputTileIterator::Params params_C(source_tensor.device_ref().layout());
+    typename Epilogue::TensorTileIterator::Params params_T(additional_tensor.device_ref().layout());
+
+    //
+    // Launch kernel
+    //
+
+    // One threadblock with 32 threads per warp of the epilogue.
+    dim3 grid(1, 1);
+    dim3 block(Epilogue::WarpCount::kCount * 32, 1);
+
+    test::kernel::epilogue_with_reduction_threadblock<Epilogue><<< grid, block >>>(
+      reduction_tensor.device_data(),
+      params_D,
+      output_tensor.device_data(),
+      params_C,
+      source_tensor.device_data(),
+      params_T,
+      additional_tensor.device_data(),
+      output_params,
+      problem_size, 
+      accumulator_tensor.device_view());
+
+    // A launch that is rejected outright is reported through cudaGetLastError(); failures while
+    // the kernel executes surface from cudaDeviceSynchronize().
+    cudaError_t result = cudaGetLastError();
+
+    if (result == cudaSuccess) {
+      result = cudaDeviceSynchronize();
+    }
+
+    if (result != cudaSuccess) {
+      std::cerr << "Kernel error: " << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    //
+    // Verify results
+    //
+    output_tensor.sync_host();
+    reduction_tensor.sync_host();
+
+    // Mismatches are counted per part so that a flood of failures in one part cannot hide the
+    // other; each part stops reporting after kMaxErrors mismatches.
+    int errors = 0;
+    int reduction_errors = 0;
+    int const kMaxErrors = 5;
+
+    //
+    // The output has two parts:
+    //  - GEMM tensor epilogue in canonical layout
+    //  - partial reduction in canonical row-major layout
+    //
+
+    // Verify the GEMM tensor output
+    for (int r = 0; errors < kMaxErrors && r < quantized_size.row(); ++r) {
+      for (int c = 0; errors < kMaxErrors && c < quantized_size.column(); ++c) {
+
+        cutlass::MatrixCoord coord{r, c};
+        ElementOutput got = output_tensor.at(coord);
+        
+        ElementOutput expected;
+        if (coord.row() < problem_size.row() && coord.column() < problem_size.column()) {
+
+          expected = ElementOutput(output_params.alpha * ElementCompute(accumulator_tensor.at(coord)) + 
+            output_params.beta * ElementCompute(source_tensor.at(coord)));
+        }
+        else {
+          // Outside the problem extent the sentinel must have survived.
+          expected = default_output;
+        }
+
+        if (expected != got) {
+
+          using OutputIO = cutlass::ScalarIO<ElementOutput>;
+
+          EXPECT_TRUE(false)
+            << "-------\n"
+            << "Error - output element (" << coord << ") - expected: " 
+            << OutputIO(expected) 
+            << ",  got: " << OutputIO(got) << std::endl;
+
+          ++errors;
+        }
+      }
+    }
+
+    // Verify the partial reduction
+    //
+    // NOTE(review): the reference sums every row of the tile, not only rows inside problem_size.
+    // That is equivalent for the full-tile case run_all() exercises; confirm the intended
+    // behavior before enabling smaller row extents.
+    for (int c = 0; reduction_errors < kMaxErrors && c < quantized_size.column(); ++c) {
+
+      ElementAccumulator reduction_acc = ElementAccumulator();
+
+      for (int r = 0; r < quantized_size.row(); ++r) {
+        reduction_acc += accumulator_tensor.at({r, c});
+      }
+
+      // Columns outside the problem extent are expected to keep their fill value.
+      ElementAccumulator expected = (c < problem_size.column()) ? reduction_acc : default_reduction;
+      ElementAccumulator got = reduction_tensor.at({0, c});
+
+      if (expected != got) {
+        
+        using OutputIO = cutlass::ScalarIO<ElementAccumulator>;
+
+        EXPECT_TRUE(false)
+          << "-------\n"
+          << "Error - reduction element (" << c << ") - expected: " 
+          << OutputIO(expected) 
+          << ", got: " << OutputIO(got) << std::endl;
+
+        ++reduction_errors;
+      }
+    }
+
+    //
+    // Report results on error
+    //
+
+    // Dump the output tile to a CSV file named after the tile shape, warp shape and slice count.
+    if (errors) {
+      std::stringstream ss;
+      ss 
+        << "output_tensor_op_" << Epilogue::Shape::kM << "x" << Epilogue::Shape::kN << "_" 
+        << Epilogue::WarpTileIterator::WarpShape::kM << "x" 
+        << Epilogue::WarpTileIterator::WarpShape::kN 
+        << "_slice_" << Epilogue::WarpCount::kK << ".csv"; 
+
+      std::ofstream output_file(ss.str()); 
+      output_file << output_tensor.host_view(); 
+    }
+
+    return errors == 0 && reduction_errors == 0;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
+++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu
+++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -63,7 +63,7 @@ struct OutputTileThreadMapExpr {
  };

  int const kWarpSize = 32;
-  int const kMemoryAccessSize = 128;  // size in bytes of the preferred memory access size
+  int const kMemoryAccessSize = 256;  // size in bytes of the preferred memory access size

  //
  // Data members
--- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
+++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/threadblock/testbed.h
+++ b/test/unit/epilogue/threadblock/testbed.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
@ -28,13 +28,14 @@
 #pragma once

 #include <fstream>
+#include <cfenv>

 #include "../../common/cutlass_unit_test.h"

 #include "cutlass/aligned_buffer.h"
 #include "cutlass/half.h"
 #include "cutlass/complex.h"
-
+#include "cutlass/quaternion.h"
 #include "cutlass/epilogue/thread/linear_combination.h"

 #include "cutlass/util/host_tensor.h"
@ -307,10 +308,18 @@ public:
        
        ElementOutput expected;
        if (coord.row() < problem_size.row() && coord.column() < problem_size.column()) {
-          expected = ElementOutput(output_params.alpha * ElementCompute(accumulator_tensor.at(coord)) + 
-            output_params.beta * ElementCompute(source_tensor.at(coord)));
-        }
-        else {
+          ElementCompute intermediate =
+            output_params.alpha * ElementCompute(accumulator_tensor.at(coord)) + 
+            output_params.beta * ElementCompute(source_tensor.at(coord));
+          
+          if (std::numeric_limits<ElementOutput>::is_integer
+              && !std::numeric_limits<ElementCompute>::is_integer) {
+            std::fesetround(FE_TONEAREST);
+            expected = ElementOutput(std::nearbyint(float(cutlass::real(intermediate))));
+          } else {
+            expected = ElementOutput(intermediate);
+          }
+        } else {
          expected = default_output;
        }

@ -322,7 +331,11 @@ public:
            << "-------\n"
            << "Error - output element (" << coord << ") - expected: " 
            << OutputIO(expected) 
-            << ",  got: " << OutputIO(got) << std::endl;
+            << ",  got: " << OutputIO(got)
+            << ",  accum: " << (accumulator_tensor.at(coord))
+            << ",  source: " << OutputIO(source_tensor.at(coord))
+            << ",  alpha: " << (output_params.alpha)
+            << ",  beta: " << (output_params.beta) << "\n";

          ++errors;
        }
--- a/test/unit/epilogue/threadblock/testbed_planar_complex.h
+++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/warp/CMakeLists.txt
+++ b/test/unit/epilogue/warp/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 cutlass_test_unit_add_executable(
--- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/test/unit/gemm/CMakeLists.txt
+++ b/test/unit/gemm/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 add_subdirectory(thread)
--- a/test/unit/gemm/device/CMakeLists.txt
+++ b/test/unit/gemm/device/CMakeLists.txt
@ -17,7 +17,7 @@
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 add_custom_target(
@ -34,6 +34,7 @@ add_custom_target(
  cutlass_test_unit_gemm_device_wmma
  cutlass_test_unit_gemm_device_tensorop_planar_complex
  cutlass_test_unit_gemm_device_sparse_tensorop_sm80
+  cutlass_test_unit_gemv_device
 )

 add_custom_target(
@ -50,6 +51,7 @@ add_custom_target(
  test_unit_gemm_device_wmma
  test_unit_gemm_device_tensorop_planar_complex
  test_unit_gemm_device_sparse_tensorop_sm80
+  test_unit_gemv_device
 )

 cutlass_test_unit_add_executable(
@ -66,6 +68,11 @@ cutlass_test_unit_add_executable(
  simt_cgemm_tn_sm50.cu
  simt_cgemm_tt_sm50.cu

+  simt_qgemm_nn_sm50.cu
+  simt_qgemm_nt_sm50.cu
+  simt_qgemm_tn_sm50.cu
+  simt_qgemm_tt_sm50.cu
+
  simt_dgemm_nn_sm50.cu
  simt_dgemm_nt_sm50.cu
  simt_dgemm_tn_sm50.cu
@ -203,6 +210,7 @@ cutlass_test_unit_add_executable(

  gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
  gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
+
 )

 cutlass_test_unit_add_executable(
@ -332,3 +340,36 @@ cutlass_test_unit_add_executable(
  gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
 )

+# Test executable cutlass_test_unit_gemv_device, built from gemv.cu.
+cutlass_test_unit_add_executable(
+  cutlass_test_unit_gemv_device
+
+  BATCH_SOURCES ON
+  BATCH_SIZE 4
+
+  gemv.cu
+)
+
+if (NOT CUDA_COMPILER MATCHES "[Cc]lang")
+# Everything up to endif() is skipped when CUDA_COMPILER matches "[Cc]lang".
+add_dependencies(
+  cutlass_test_unit_gemm_device
+  cutlass_test_unit_gemm_device_gemm_with_fused_epilogue_tensorop
+  )
+
+add_dependencies(
+  test_unit_gemm_device
+  test_unit_gemm_device_gemm_with_fused_epilogue_tensorop
+  )
+# Executable for the fused-epilogue GEMM tests (the reduction and broadcast sources listed below).
+cutlass_test_unit_add_executable(
+  cutlass_test_unit_gemm_device_gemm_with_fused_epilogue_tensorop
+  
+  gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
+  gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
+
+  gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
+)
+
+endif()
+
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
@ -18,7 +18,7 @@
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
--- a/Show More
+++ b/Show More