CUTLASS 2.0 (#62)

CUTLASS 2.0

Substantially refactored for

- Better performance, particularly for native Turing Tensor Cores
- Robust and durable templates spanning the design space
- Encapsulated functionality embodying modern C++11 programming techniques
- Optimized containers and data types for efficient, generic, portable device code

Updates to:
- Quick start guide
- Documentation
- Utilities
- CUTLASS Profiler

Native Turing Tensor Cores
- Efficient GEMM kernels targeting Turing Tensor Cores
- Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands

Coverage of existing CUTLASS functionality:
- GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs
- Volta Tensor Cores through native mma.sync and through WMMA API
- Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions
- Batched GEMM operations
- Complex-valued GEMMs

Note: this commit and all that follow require a host compiler supporting C++11 or greater.
This commit is contained in:
Andrew Kerr
2019-11-19 16:55:34 -08:00
committed by GitHub
parent b5cab177a9
commit fb335f6a5f
5434 changed files with 599799 additions and 250176 deletions

View File

@ -0,0 +1,33 @@
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Unit-test executable covering threadblock-scoped matrix multiply (MMA)
# implementations: pipelined and single-stage variants through the WMMA API
# (SM70/SM75), native mma.sync paths (SM70/SM75), and the SIMT path.
cutlass_test_unit_add_executable(
cutlass_test_unit_gemm_threadblock
mma_pipelined_wmma_sm70.cu
mma_pipelined_wmma_sm75.cu
mma_singlestage_wmma_sm70.cu
mma_singlestage_wmma_sm75.cu
mma_pipelined_sm70.cu
mma_pipelined_sm75.cu
mma_pipelined_simt.cu
)

View File

@ -0,0 +1,640 @@
/***************************************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for threadblock level GEMV
*/
#include "../../common/cutlass_unit_test.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/core_io.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/gemm/threadblock/gemv.h"
#include "cutlass/gemm/threadblock/default_gemv_core.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Device kernel exercising a threadblock-scoped batched GEMV.
///
/// Expected launch configuration (see the host-side driver):
///   grid  = (1, 1)                        -- a single CTA
///   block = (Shape::kN / LDG_N, num_batch, 1)
/// Each threadIdx.y slice operates on its own batch; stride_a/b/c are the
/// per-batch pointer offsets (in elements) for the A, B, and C tensors.
template <typename Gemv, typename LongIndex, typename RefA, typename RefB, typename RefC>
__global__ void batched_gemv_threadblock_test_kernel(
  cutlass::gemm::GemmCoord problem_size,
  LongIndex stride_a,
  LongIndex stride_b,
  LongIndex stride_c,
  RefA ref_A,
  RefB ref_B,
  RefC ref_C
) {
  typename Gemv::IteratorA::TensorCoord threadblock_offset_A(0, 0);
  typename Gemv::IteratorB::TensorCoord threadblock_offset_B(0, 0);

  // Fixed: use IteratorC's own coordinate type here (the original declared
  // this with IteratorB::TensorCoord — layout-compatible, but inconsistent).
  typename Gemv::IteratorC::TensorCoord threadblock_offset_C(0, 0);

  // Advance each tensor ref to the batch owned by this threadIdx.y slice.
  ref_A.add_pointer_offset(threadIdx.y * stride_a);
  ref_B.add_pointer_offset(threadIdx.y * stride_b);
  ref_C.add_pointer_offset(threadIdx.y * stride_c);

  // Construct iterators to the A and B operands.
  typename Gemv::IteratorA::Params params_A(ref_A.layout());
  typename Gemv::IteratorA iterator_A(params_A, ref_A.data(), { problem_size.m(), problem_size.k() }, 0, threadblock_offset_A);

  typename Gemv::IteratorB::Params params_B(ref_B.layout());
  typename Gemv::IteratorB iterator_B(params_B, ref_B.data(), { problem_size.k(), problem_size.n() }, threadIdx.x, threadblock_offset_B);

  Gemv gemv;

  typename Gemv::FragmentC accum;
  accum.clear();

  // Compute the threadblock-scoped matrix multiply-accumulate.
  gemv(problem_size, accum, iterator_A, iterator_B, accum);

  // IteratorC is PitchLinear<> and assumes n() is the contiguous dimension.
  typename Gemv::IteratorC::Params params_C(ref_C.layout());
  typename Gemv::IteratorC iterator_C(params_C, ref_C.data(), { problem_size.m(), problem_size.n() }, threadIdx.x, threadblock_offset_C);
  iterator_C.store(accum);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Host-side driver for the batched GEMV threadblock test.
///
/// Allocates num_batch contiguous batches of A, B, and C in single backing
/// tensors, launches one CTA in which threadIdx.y indexes the batch, then
/// verifies each computed batch against a host reference GEMM.
///
/// Template parameters:
///   Shape_                 -- threadblock-level GemmShape
///   ElementAB_             -- element type of the A and B operands
///   ElementC_              -- element type of the C operand / accumulator
///   LayoutA_/B_/C_         -- operand layouts
///   LDG_N, LDG_K           -- per-thread access widths in N and K
///   MAX_THREADS_PER_BLOCK  -- sanity bound on the launch configuration
///   DEBUG                  -- if true, run a single batch with sequential fills
template<typename Shape_,
         typename ElementAB_,
         typename ElementC_,
         typename LayoutA_,
         typename LayoutB_,
         typename LayoutC_,
         int LDG_N,
         int LDG_K,
         int MAX_THREADS_PER_BLOCK=512,
         bool DEBUG=false>
void batched_gemv_threadblock_test(cutlass::gemm::GemmCoord problem_size, int num_batch)
{
  using Shape = Shape_;
  using ElementA = ElementAB_;
  using LayoutA = LayoutA_;
  using ElementB = ElementAB_;
  using LayoutB = LayoutB_;
  using ElementC = ElementC_;
  using LayoutC = LayoutC_;
  using ThreadShape = cutlass::gemm::GemmShape<1, LDG_N, LDG_K>;

  using Core = typename cutlass::gemm::threadblock::DefaultGemvCore<
    Shape,
    ThreadShape,
    ElementA,
    LayoutA,
    ElementB,
    LayoutB,
    ElementC,
    LayoutC
  >;

  if (DEBUG) {
    // Deterministic, single-batch run for easier inspection.
    num_batch = 1;
  }

  using Mma = cutlass::gemm::threadblock::Gemv<Core>;

  // Create host tensors that will be the backing store for the batches.
  // Note that no device memory is initially allocated.
  cutlass::HostTensor<ElementA, LayoutA> matrix_A({problem_size.m(), problem_size.k()}, false);
  cutlass::HostTensor<ElementB, LayoutB> matrix_B({problem_size.k(), problem_size.n()}, false);
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed({problem_size.m(), problem_size.n()}, false);
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference({problem_size.m(), problem_size.n()}, false);

  // Reserve memory for the whole batch of tensors.
  matrix_A.reserve(problem_size.m()*problem_size.k()*num_batch);
  matrix_B.reserve(problem_size.n()*problem_size.k()*num_batch);
  matrix_C_computed.reserve(problem_size.m()*problem_size.n()*num_batch);
  matrix_C_reference.reserve(problem_size.m()*problem_size.n()*num_batch, false);

  // Fill each tensor batch.
  const int seed = 6834;
  for (int b = 0; b < num_batch; b++) {
    if (DEBUG) {
      cutlass::reference::host::BlockFillSequential(
        matrix_A.host_data_ptr_offset(b*matrix_A.capacity()), matrix_A.capacity());
      cutlass::reference::host::BlockFillSequential(
        matrix_B.host_data_ptr_offset(b*matrix_B.capacity()), matrix_B.capacity());
    }
    else {
      cutlass::reference::host::TensorFillRandomUniform(
        matrix_A.host_view(b*matrix_A.capacity()),
        seed + 1660,
        8,
        -8,
        0
      );
      cutlass::reference::host::TensorFillRandomUniform(
        matrix_B.host_view(b*matrix_B.capacity()),
        seed + 1880,
        8,
        -8,
        0
      );
    }

    cutlass::reference::host::TensorFill(matrix_C_computed.host_view(b*matrix_C_computed.capacity()));
    cutlass::reference::host::TensorFill(matrix_C_reference.host_view(b*matrix_C_reference.capacity()));
  }

  matrix_A.sync_device();
  matrix_B.sync_device();
  matrix_C_computed.sync_device();

  dim3 grid(1, 1);      // only 1 CTA is used
  dim3 block(Shape::kN / LDG_N, num_batch, 1);

#if 0
  printf("block dim = %d x %d\n", block.x, block.y);
#endif

  // Some sanity checks on the launch configuration.
  EXPECT_TRUE( problem_size.n() % LDG_N == 0 );
  EXPECT_TRUE( block.x*block.y <= MAX_THREADS_PER_BLOCK );

  test::gemm::threadblock::batched_gemv_threadblock_test_kernel<Mma><<< grid, block >>>(
    problem_size,
    matrix_A.capacity(),
    matrix_B.capacity(),
    matrix_C_computed.capacity(),
    matrix_A.device_ref(),
    matrix_B.device_ref(),
    matrix_C_computed.device_ref()
  );

  // Catch launch-configuration errors first, then asynchronous execution errors.
  cudaError_t result = cudaGetLastError();
  EXPECT_EQ(result, cudaSuccess) << " kernel launch error: " << cudaGetErrorString(result);

  result = cudaDeviceSynchronize();
  EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result);

  matrix_C_computed.sync_host();

  // Compute the batched gemms on the host and compare per batch.
  for (int b = 0; b < num_batch; b++) {
    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
                                   ElementC, LayoutC, ElementC, ElementC> reference_gemm;

    reference_gemm(
      problem_size.mnk(),
      ElementC(1),
      matrix_A.host_ref(b*matrix_A.capacity()),
      matrix_B.host_ref(b*matrix_B.capacity()),
      ElementC(0),
      // Fixed: offset by matrix_C_reference's own capacity (the original used
      // matrix_C_computed.capacity() here; the values are equal, but pairing
      // each tensor with its own capacity keeps the indexing consistent).
      matrix_C_reference.host_ref(b*matrix_C_reference.capacity())
    );

    bool passed = cutlass::reference::host::TensorEquals(
      matrix_C_computed.host_view(b*matrix_C_computed.capacity()),
      matrix_C_reference.host_view(b*matrix_C_reference.capacity()));

    EXPECT_TRUE(passed)
      //<< "A:\n" << matrix_A.host_view() << "\n"
      //<< "B:\n" << matrix_B.host_view() << "\n"
      << "Batch: " << b << "\n"
      << "Reference:\n" << matrix_C_reference.host_view(b*matrix_C_reference.capacity()) << "\n"
      << "Computed:\n" << matrix_C_computed.host_view(b*matrix_C_computed.capacity()) << "\n";
  }
}
} // namespace threadblock
} // namespace gemm
} // namespace test
/////////////////////////////////////////////////////////////////////////////////////////////////
// Batched GEMV tests, layout combination "crc":
// A: ColumnMajor
// B: RowMajor
// C: ColumnMajor
//
// Test-name convention:
//   <num_batch>x<M>x<N>x<K>_<layouts>_<elementAB>_<elementC>_<LDG_N>N_<LDG_K>K
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp32_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_crc_fp32_fp32_4N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
const int num_batch = 5;
const int LDG_N = 4;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// N = 17 with Shape::kN = 32 exercises a problem size that does not divide
// the threadblock shape evenly.
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_fp32_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
float, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// half_t operands accumulated in fp32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp16_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_fp16_fp32_2N_8K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 8;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_fp16_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// int8 operands accumulated in int32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_crc_i8_i32_2N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_crc_i8_i32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// Batched GEMV tests, layout combination "rcr":
// A: RowMajor
// B: ColumnMajor
// C: RowMajor
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp32_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_rcr_fp32_fp32_4N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
const int num_batch = 5;
const int LDG_N = 4;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// N = 17 with Shape::kN = 32 exercises a problem size that does not divide
// the threadblock shape evenly.
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_fp32_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// half_t operands accumulated in fp32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp16_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_fp16_fp32_2N_8K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 8;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_fp16_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// int8 operands accumulated in int32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcr_i8_i32_2N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcr_i8_i32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::RowMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// Batched GEMV tests, layout combination "rcc":
// A: RowMajor
// B: ColumnMajor
// C: ColumnMajor
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp32_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 5x1x128x128_rcc_fp32_fp32_4N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 128, 128);
const int num_batch = 5;
const int LDG_N = 4;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape, float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// N = 17 with Shape::kN = 32 exercises a problem size that does not divide
// the threadblock shape evenly.
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_fp32_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
float, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// half_t operands accumulated in fp32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp16_fp32_2N_2K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 2;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_fp16_fp32_2N_8K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 8;
using Shape = cutlass::gemm::GemmShape<1, 64, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_fp16_fp32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
cutlass::half_t, float,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
// int8 operands accumulated in int32.
TEST(SM50_batched_gemv_threadblock, 4x1x64x64_rcc_i8_i32_2N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 64, 64);
const int num_batch = 4;
const int LDG_N = 2;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 128, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}
TEST(SM50_batched_gemv_threadblock, 16x1x17x64_rcc_i8_i32_1N_4K) {
using namespace test::gemm::threadblock;
cutlass::gemm::GemmCoord problem_size(1, 17, 64);
const int num_batch = 16;
const int LDG_N = 1;
const int LDG_K = 4;
using Shape = cutlass::gemm::GemmShape<1, 32, LDG_K>;
batched_gemv_threadblock_test<Shape,
int8_t, int32_t,
cutlass::layout::RowMajor,
cutlass::layout::ColumnMajor,
cutlass::layout::ColumnMajor,
LDG_N, LDG_K>(problem_size, num_batch);
}

View File

@ -0,0 +1,124 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "../../common/cutlass_unit_test.h"
#include "cutlass/epilogue/epilogue_workspace.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Kernel computes a synthetic accumulator tile and stores it out through the
/// epilogue under test.
///
/// Expected launch configuration: block = (warp size, warp count, 1); each
/// CTA covers one tile of Epilogue::Shape.
template <typename Epilogue>
__global__ void kernel_epilogue_workspace(typename Epilogue::Params params) {
  __shared__ typename Epilogue::SharedStorage shared_storage;

  int warp_id = threadIdx.y;
  int lane_id = threadIdx.x;

  Epilogue epilogue(params, shared_storage, warp_id, lane_id);

  //
  // Initialize accumulator tile: each thread writes its flattened thread id
  // so stored output can be attributed to the producing thread.
  //

  // Fixed: 'Element' was previously an undeclared name; take the fragment's
  // element type from the epilogue's accumulator fragment.
  using Element = typename Epilogue::FragmentC::Element;

  typename Epilogue::FragmentC accum;

  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < Epilogue::FragmentC::kElements; ++i) {
    accum[i] = Element(warp_id * blockDim.x + lane_id);
  }

  //
  // Efficient epilogue
  //
  cutlass::GemmCoord tb_tile_coord{blockIdx.x, blockIdx.y, 0};

  cutlass::GemmCoord problem_size =
      tb_tile_coord *
      cutlass::GemmCoord{Epilogue::Shape::kM, Epilogue::Shape::kN, 1};

  // Store accumulators
  epilogue(
    problem_size,
    tb_tile_coord,
    accum);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace test
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Instantiates EpilogueWorkspace for a 128x128 threadblock tile with four
/// warps and verifies the storing kernel launches and completes cleanly.
TEST(SM75_gemm_threadblock_epilogue_workspace, tensor_op_128x128_64x64) {
  //
  // Define an instance of the epilogue and see if it works
  //
  static int const kWarpCount = 4;
  static int const kWarpSize = 32;

  using Shape = cutlass::MatrixShape<128, 128>;

  // One accumulator element per thread-owned output element.
  using FragmentC = cutlass::Array<int, Shape::kCount / (kWarpCount * kWarpSize)>;

  using Epilogue = cutlass::gemm::threadblock::EpilogueWorkspace<
    Shape,
    kWarpCount,
    FragmentC
  >;

  typename Epilogue::Params params(
  );

  // Launch the kernel: one CTA of kWarpCount warps.
  dim3 grid(1,1);
  dim3 block(kWarpSize, kWarpCount);

  test::gemm::threadblock::kernel_epilogue_workspace<Epilogue><<< grid, block >>>(
    params
  );

  // Check for launch-configuration errors first, then for asynchronous
  // execution errors surfaced by the synchronize.
  cudaError_t result = cudaGetLastError();
  EXPECT_EQ(result, cudaSuccess) << "Kernel launch error - " << cudaGetErrorString(result);

  result = cudaDeviceSynchronize();
  EXPECT_EQ(result, cudaSuccess) << "Kernel execution error - " << cudaGetErrorString(result);
}
/////////////////////////////////////////////////////////////////////////////////////////////////

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,492 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "mma_pipelined_testbed.h"
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED)
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x64x32_64x64x32_8x8x4) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementC = float;
using LayoutC = cutlass::layout::ColumnMajor;
cutlass::gemm::GemmCoord problem_size(64, 64, 128);
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
float alpha = 1.f;
float beta = 0.0f;
// Define the MmaCore components
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementC, LayoutC,
cutlass::arch::OpClassTensorOp>;
dim3 grid(1, 1);
dim3 block(32, 1, 1);
test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
problem_size.k(), alpha, beta)
.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x128x32_64x64x32_8x8x4) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementC = float;
using LayoutC = cutlass::layout::ColumnMajor;
cutlass::gemm::GemmCoord problem_size(128, 128, 128);
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
float alpha = 1.f;
float beta = 0.0f;
// Define the MmaCore components
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementC, LayoutC,
cutlass::arch::OpClassTensorOp>;
dim3 grid(1, 1);
dim3 block(32, 4, 1);
test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
problem_size.k(), alpha, beta)
.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x64x32_32x32x32_8x8x4) {
using ElementA = cutlass::half_t;
using LayoutA = cutlass::layout::ColumnMajor;
using ElementB = cutlass::half_t;
using LayoutB = cutlass::layout::RowMajor;
using ElementC = float;
using LayoutC = cutlass::layout::ColumnMajor;
cutlass::gemm::GemmCoord problem_size(64, 64, 128);
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
float alpha = 1.f;
float beta = 0.0f;
// Define the MmaCore components
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementC, LayoutC,
cutlass::arch::OpClassTensorOp>;
dim3 grid(1, 1);
dim3 block(32, 4, 1);
test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
problem_size.k(), alpha, beta)
.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x64x32_64x32x32_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x32 threadblock tiled into 64x32x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_128x64x64_64x32x64_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x64 threadblock tiled into 64x32x64 warps.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
  // Renamed from OperatorShape for consistency: every sibling test in this
  // file calls the 8x8x4 mma.sync tile InstructionShape.
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_congruous, tensor_op_64x128x32_32x64x32_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 64x128x32 threadblock tiled into 32x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = float;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Test name corrected: the warp tile below is 64x64x32 (eight warps within
// the 256x128x32 threadblock, matching block(32, 8, 1)), not 32x64x32 as the
// previous name claimed.
TEST(SM70_gemm_threadblock_congruous, tensor_op_256x128x32_64x64x32_8x8x4) {
  // half_t x half_t -> float, A column-major / B row-major, on Volta Tensor
  // Cores (8x8x4 mma.sync).
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(256, 128, 128);

  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // Eight warps: (256/64) x (128/64) = 4 x 2.
  dim3 grid(1, 1);
  dim3 block(32, 8, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x64x32_64x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): threadblock and warp tiles coincide at 64x64x32.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x128x32_64x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_256x128x32_64x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 256x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(256, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of eight warps (4x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 8, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x64x32_32x32x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 64x64x32 threadblock tiled into 32x32x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x64x32_64x32x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x32 threadblock tiled into 64x32x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_128x64x64_64x32x64_8x8x4) {
  // half_t x half_t -> float, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 128x64x64 threadblock tiled into 64x32x64 warps.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::ColumnMajor;

  cutlass::gemm::GemmCoord problem_size(128, 64, 128);

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
  // Renamed from OperatorShape for consistency: every sibling test in this
  // file calls the 8x8x4 mma.sync tile InstructionShape.
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  float alpha = 1.f;
  float beta = 0.0f;

  // Define the MmaCore components
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassTensorOp>;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
                                            problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_crosswise, tensor_op_64x128x32_32x64x32_8x8x4) {
  // half_t x half_t -> half_t, A row-major / B column-major, on Volta Tensor
  // Cores (8x8x4 mma.sync): 64x128x32 threadblock tiled into 32x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::ColumnMajor;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp>;

  cutlass::gemm::GemmCoord problem_size(64, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,334 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
*notice, this list of conditions and the following disclaimer in the
*documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its
*contributors may be used to endorse or promote products derived from this
*software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
*OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit testbed for kernel-level GEMM
*/
#pragma once
#include "../../common/cutlass_unit_test.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/vector.h"
#include "cutlass/numeric_types.h"
#include "cutlass/core_io.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
#include "cutlass/cutlass.h"
#include "cutlass/platform/platform.h"
namespace test {
namespace gemm {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Device kernel wrapping a threadblock-scoped matrix multiply-accumulate.
///
/// Launch expectations: blockDim.x == 32 (one warp per threadIdx.y slice),
/// so threadIdx.y is the warp index and threadIdx.x the lane index. Each
/// threadblock computes the ThreadblockShape tile of C at tile coordinate
/// (blockIdx.x, blockIdx.y) and stores raw accumulators (no epilogue
/// scaling) through the warp-level IteratorC.
template <typename Mma>
__global__ void kernel_mma(cutlass::gemm::GemmCoord problem_size,
                           typename Mma::IteratorA::Params params_A,
                           typename Mma::IteratorA::TensorRef ref_A,
                           typename Mma::IteratorB::Params params_B,
                           typename Mma::IteratorB::TensorRef ref_B,
                           typename Mma::ElementC *ptr_C, int ldc) {
  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
  __shared__ typename Mma::SharedStorage shared_storage;

  // Compute threadblock location
  cutlass::gemm::GemmCoord tb_tile_offset = {int(blockIdx.x), int(blockIdx.y),
                                             0};

  // Logical offsets (in elements) of this threadblock's A and B tiles.
  cutlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * Mma::Shape::kM,
                                   tb_tile_offset.k()};

  cutlass::MatrixCoord tb_offset_B{tb_tile_offset.k(),
                                   tb_tile_offset.n() * Mma::Shape::kN};

  // Compute position within threadblock
  int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;

  // Construct iterators to A and B operands
  typename Mma::IteratorA iterator_A(params_A, ref_A.data(),
                                     {problem_size.m(), problem_size.k()},
                                     tb_thread_id, tb_offset_A);

  typename Mma::IteratorB iterator_B(params_B, ref_B.data(),
                                     {problem_size.k(), problem_size.n()},
                                     tb_thread_id, tb_offset_B);

  int warp_id = threadIdx.y;
  int lane_id = threadIdx.x;

  // Construct thread-scoped matrix multiply.
  // (Previously passed threadIdx.x directly; lane_id is the same value and
  // makes the fourth argument's role explicit.)
  Mma mma(shared_storage, tb_thread_id, warp_id, lane_id);

  typename Mma::FragmentC accum;
  accum.clear();

  // Ceiling division: number of mainloop iterations over K.
  int gemm_k_iterations =
      (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;

  // Compute threadblock-scoped matrix multiply-add
  mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);

  // Output results; warps are numbered column-major within the threadblock
  // (warp_id % WarpCount::kM selects the row, warp_id / WarpCount::kM the
  // column of the warp grid).
  typename Mma::Operator::IteratorC iterator_C({ptr_C, ldc}, lane_id);

  iterator_C.add_tile_offset(
      {(tb_tile_offset.m() * Mma::WarpCount::kM) +
           (warp_id % Mma::WarpCount::kM),
       (tb_tile_offset.n() * Mma::WarpCount::kN) +
           (warp_id / Mma::WarpCount::kM)});

  iterator_C.store(accum);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product
template <
    /// Threadblock-level matrix multiply-accumulate
    typename MmaCore_,
    /// Number of pipeline stages (1 selects MmaSingleStage, 2 MmaPipelined)
    int Stages = 2>
struct Testbed {

  /// Threadblock-level GEMM implementation
  using MmaCore = MmaCore_;
  using ThreadblockShape = typename MmaCore::Shape;
  using WarpShape = typename MmaCore::WarpShape;
  using InstructionShape = typename MmaCore::InstructionShape;
  using ElementA = typename MmaCore::ElementA;
  using LayoutA = typename MmaCore::LayoutA;
  using ElementB = typename MmaCore::ElementB;
  using LayoutB = typename MmaCore::LayoutB;
  using ElementC = typename MmaCore::ElementC;
  using LayoutC = typename MmaCore::LayoutC;
  static const int kStages = Stages;

  // Define iterators over tiles from the A operand
  // int8 x int8 SIMT GEMMs (idp4a path) use the 2D-thread-tile iterator;
  // all other configurations use the standard predicated tile iterator.
  static const bool use_idp4a = cutlass::platform::is_same<ElementA, int8_t>::value &&
                                cutlass::platform::is_same<ElementB, int8_t>::value &&
                                cutlass::platform::is_same<typename MmaCore::OperatorClass, cutlass::arch::OpClassSimt>::value;

  // Whether the 2D-thread-tile iterators must transpose A / B on load.
  static const bool transposeA = cutlass::platform::is_same< LayoutA, cutlass::layout::ColumnMajor >::value;
  static const bool transposeB = cutlass::platform::is_same< LayoutB, cutlass::layout::RowMajor >::value;

  using IteratorA = typename cutlass::platform::conditional< use_idp4a,
      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA> ,
      cutlass::transform::threadblock::PredicatedTileIterator<
          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA>
      >::type;

  // Define iterators over tiles from the B operand
  using IteratorB = typename cutlass::platform::conditional< use_idp4a,
      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB> ,
      cutlass::transform::threadblock::PredicatedTileIterator<
          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB>
      >::type;

  // Define MmaPipeline Single Stage
  using MmaPipelineSingleStage = cutlass::gemm::threadblock::MmaSingleStage<
      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
      IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
      typename MmaCore::MmaPolicy>;

  // Define MmaPipeline Two Stages
  using MmaPipelineTwoStages = cutlass::gemm::threadblock::MmaPipelined<
      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
      IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
      typename MmaCore::MmaPolicy>;

  // Define the threadblock-scoped pipelined matrix multiply (Select between Single vs. Two stages)
  using Mma = typename cutlass::platform::conditional<(kStages==1), MmaPipelineSingleStage, MmaPipelineTwoStages>::type;

  //
  // Data members
  //

  // Host/device tensors for the operands and the two results.
  cutlass::HostTensor<ElementA, LayoutA> matrix_A;
  cutlass::HostTensor<ElementB, LayoutB> matrix_B;
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed;
  cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference;

  cutlass::gemm::GemmCoord problem_size;
  float alpha, beta;

  //
  // Methods
  //

  /// Allocates workspace in device memory
  Testbed(int m, int n, int k, float alpha_, float beta_)
      : problem_size(m, n, k), alpha(alpha_), beta(beta_) {
    matrix_A.reset(cutlass::make_Coord(m, k));
    matrix_B.reset(cutlass::make_Coord(k, n));
    matrix_C_computed.reset(cutlass::make_Coord(m, n));
    // Reference result is computed on the host only; no device allocation.
    matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
  }

  /// Runs the test: fills A/B per the requested distributions, launches the
  /// kernel with the given grid/block, then compares against a host
  /// reference GEMM. Returns true on exact match.
  bool run(
      dim3 grid, dim3 block,
      cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
      cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {

    //
    // initialize device memory
    //

    if (init_A == cutlass::Distribution::Uniform) {
      // Narrow the random range for sub-byte element types.
      int scope_max = 8;
      int scope_min = -8;

      if (cutlass::sizeof_bits<ElementA>::value == 4) {
        scope_max = 2;
        scope_min = -2;
      } else if (cutlass::sizeof_bits<ElementA>::value == 1) {
        scope_max = 2;
        scope_min = 0;
      }

      uint64_t seed = 7;
      cutlass::reference::host::TensorFillRandomUniform(
          matrix_A.host_view(), seed, scope_max, scope_min, 0);
    } else if (init_A == cutlass::Distribution::Sequential) {
      cutlass::reference::host::BlockFillSequential(matrix_A.host_data(),
                                                    matrix_A.capacity());
    } else if (init_A == cutlass::Distribution::Identity) {
      cutlass::reference::host::TensorFillIdentity(matrix_A.host_view());
    } else {
      // TODO: Implement the rest
      return false;
    }

    if (init_B == cutlass::Distribution::Uniform) {
      // Same range logic as A.
      int scope_max = 8;
      int scope_min = -8;

      if (cutlass::sizeof_bits<ElementB>::value == 4) {
        scope_max = 2;
        scope_min = -2;
      } else if (cutlass::sizeof_bits<ElementB>::value == 1) {
        scope_max = 2;
        scope_min = 0;
      }

      uint64_t seed = 7;
      // Offset the seed so B's values differ from A's.
      cutlass::reference::host::TensorFillRandomUniform(
          matrix_B.host_view(), seed + 16, scope_max, scope_min, 0);
    } else if (init_B == cutlass::Distribution::Sequential) {
      cutlass::reference::host::BlockFillSequential(matrix_B.host_data(),
                                                    matrix_B.capacity());
    } else if (init_B == cutlass::Distribution::Identity) {
      cutlass::reference::host::TensorFillIdentity(matrix_B.host_view());
    } else {
      // TODO: Implement the rest
      return false;
    }

    // Zero both result tensors so beta scaling starts from a known state.
    cutlass::reference::host::TensorFill(matrix_C_computed.host_view());
    cutlass::reference::host::TensorFill(matrix_C_reference.host_view());

    matrix_A.sync_device();
    matrix_B.sync_device();
    matrix_C_computed.sync_device();

    typename IteratorA::Params params_A(matrix_A.layout());
    typename IteratorB::Params params_B(matrix_B.layout());

    test::gemm::threadblock::kernel_mma<Mma><<<grid, block>>>(
        problem_size, params_A, matrix_A.device_ref(), params_B,
        matrix_B.device_ref(), matrix_C_computed.device_data(),
        matrix_C_computed.layout().stride(0));

    //
    // Check error code
    //

    cudaError_t result = cudaDeviceSynchronize();
    EXPECT_EQ(result, cudaSuccess)
        << " kernel error: " << cudaGetErrorString(result);

    matrix_C_computed.sync_host();

    // Host reference GEMM using the same math operator as the device kernel.
    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
                                   ElementC, LayoutC, ElementC, ElementC,
                                   typename MmaCore::Operator>
        reference_gemm;

    reference_gemm(
        problem_size, ElementC(alpha), matrix_A.host_view(),
        matrix_B.host_view(), ElementC(beta), matrix_C_reference.host_view());

    // Integer-valued inputs make an exact equality comparison valid here.
    bool passed = cutlass::reference::host::TensorEquals(
        matrix_C_computed.host_view(), matrix_C_reference.host_view());

    EXPECT_TRUE(passed);

    // On mismatch, dump all operands and both results for offline diffing.
    if (!passed) {
      std::ofstream output("mma_pipelined_testbed_errors.txt");

      output
          << "A:\n" << matrix_A.host_view() << "\n"
          << "B:\n" << matrix_B.host_view() << "\n"
          << "Reference:\n"
          << matrix_C_reference.host_view() << "\n"
          << "Computed:\n"
          << matrix_C_computed.host_view() << "\n";
    }

    return passed;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace gemm
} // namespace test

View File

@ -0,0 +1,760 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use double-buffered (kStages=2) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[dtype].[threadblock_shape]_[warp_shape]
//////////////// [START] Verifying all layouts {N,T}x{N,T}=>{N,T} for WMMA 16x16x16 [START] //////////////////////
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
////////////////////////////////////////////////////////////
// tests for {N,T}x{N,T}=>{T}
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A row-major x B column-major -> C row-major, with a
  // single 64x64x32 warp tile covering the whole threadblock.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_row_f16, 64x64x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A column-major x B row-major -> C row-major, with a
  // single 64x64x32 warp tile covering the whole threadblock.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_row_f16, 128x128x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A column-major x B row-major -> C row-major,
  // 128x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 64);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_row_f16, 64x64x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A row-major x B row-major -> C row-major, with a
  // single 64x64x32 warp tile covering the whole threadblock.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of a single warp.
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_row_f16, 128x128x32_64x64x32_16x16x16) {
  // WMMA 16x16x16 f16: A row-major x B row-major -> C row-major,
  // 128x128x32 threadblock tiled into 64x64x32 warps.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementC = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  // Double-buffered mainloop.
  static const int kStages = 2;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Threadblock-scoped matrix multiply-accumulate core.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(128, 128, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // One threadblock of four warps (2x2 warp grid).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages> testbed(
      problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta);
  testbed.run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B column-major -> C row-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B column-major -> C row-major,
// larger 128x128 threadblock tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of 4 warps (32x4 = 128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// tests for {N,T}x{N,T}=>{N}
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A row-major x B column-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B row-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_row_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.row.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A row-major x B row-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_row_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.col.col.m16n16k16.f16.f16 (wmma native size 16x16x16)
///////////////////////////////////////////////////////////////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A column-major x B column-major -> C column-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_col_col_col_f16, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 32);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
//////////////// [END] Verifying all layouts {N,T}x{N,T}=>{N,T} for WMMA 16x16x16 [END] //////////////////////
// Double-buffered WMMA f16 threadblock mainloop: A row-major x B column-major -> C row-major,
// 128x128 threadblock tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 64);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of 4 warps (32x4 = 128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Multi-CTA variant: 256x256 problem covered by a 2x2 grid of 128x128 threadblocks.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // 2x2 grid of threadblocks, each of 4 warps (32x4 = 128 threads).
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f16.f16 (wmma native size 32x8x16)
///////////////////////////////////////////////////////////////////////////////
// Exercises the non-square 32x8x16 native WMMA shape (f16 accumulation).
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_32x8x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 runs several mainloop iterations against the K=32 threadblock tile.
  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
//////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f16.f16 (wmma native size 8x32x16)
//////////////////////////////////////////////////////////////////////////////
// Exercises the non-square 8x32x16 native WMMA shape (f16 accumulation).
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_8x32x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 runs several mainloop iterations against the K=32 threadblock tile.
  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
//////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32 (wmma native size 16x16x16)
//////////////////////////////////////////////////////////////////////////////////
// f16 inputs with float (f32) accumulation: A row-major x B column-major -> C row-major.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// f16 inputs with f32 accumulation, 128x128 threadblock tile.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(128, 128, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of 4 warps (32x4 = 128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Multi-CTA f32-accumulation variant: 256x256 problem over a 2x2 grid of 128x128 threadblocks.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(256, 256, 96);
  float alpha = 1.0f;
  float beta = 0.0f;

  // 2x2 grid of threadblocks, each of 4 warps (32x4 = 128 threads).
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f32.f32 (wmma native size 32x8x16)
////////////////////////////////////////////////////////////
// Exercises the non-square 32x8x16 native WMMA shape with f32 accumulation.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_32x8x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f32.f32 (wmma native size 8x32x16)
/////////////////////////////////////////////////////////////////////////////////
// Exercises the non-square 8x32x16 native WMMA shape with f32 accumulation.
TEST(SM70_gemm_threadblock_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_8x32x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand element types and layouts; accumulator is f32.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_WMMA_SM70_ENABLED

View File

@ -0,0 +1,331 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use double-buffered (kStages=2) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[atype].[threadblock_shape]_[warp_shape]_[instruction_shape]
/////////////////////////////////////////////////////////////////////////
/// Integer (s8 and u8) WMMA threadblock level tests /////
/////////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED)
// Integer WMMA: s8 inputs with s32 accumulation, A row-major x B column-major -> C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Integer WMMA: s8 inputs with s32 accumulation, deeper K=64 threadblock/warp tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Integer WMMA: s8 inputs with s32 accumulation, A column-major x B row-major -> C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_col_row_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Integer WMMA: s8 inputs with s32 accumulation, column x row layouts, deeper K=64 tile.
TEST(SM75_gemm_threadblock_wmma_tensor_op_col_row_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // 8-bit integer operands accumulated into 32-bit integers.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
////////////////////////////////////////////////////////////////////////
/// SUBBYTE (s4 and b1) WMMA threadblock level tests ////
///////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
// Sub-byte WMMA: 4-bit signed integer operands with s32 accumulation -> C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_s4, 64x64x128_64x64x128_8x8x32) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // 4-bit integer operands accumulated into 32-bit integers.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 128);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Sub-byte WMMA: 4-bit signed integer operands with s32 accumulation -> C column-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_col_s4, 64x64x64_64x64x64_8x8x32) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // 4-bit integer operands accumulated into 32-bit integers.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma composed from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem(64, 64, 64);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Binarized WMMA: 1-bit operands combined with the XOR-popc operator, s32 accumulation,
// C row-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_row_b1, 64x64x512_64x64x512_8x8x128) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // 1-bit operands accumulated into 32-bit integers.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma; the trailing OpXorPopc selects the binarized multiply-add.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  cutlass::gemm::GemmCoord problem(64, 64, 2048);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
// Binarized WMMA: 1-bit operands combined with the XOR-popc operator, s32 accumulation,
// C column-major.
TEST(SM75_gemm_threadblock_wmma_tensor_op_row_col_col_b1, 64x64x512_64x64x512_8x8x128) {
  // Tile sizes: threadblock / warp / native WMMA instruction shape.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // 1-bit operands accumulated into 32-bit integers.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Two-stage (double-buffered) software pipeline.
  static int const kStages = 2;

  // Threadblock-scoped mma; the trailing OpXorPopc selects the binarized multiply-add.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape,
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  cutlass::gemm::GemmCoord problem(64, 64, 2048);
  float alpha = 1.0f;
  float beta = 0.0f;

  // Single threadblock of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem.m(), problem.n(),
                                                     problem.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
#endif //CUTLASS_ARCH_WMMA_SM75_ENABLED

View File

@ -0,0 +1,411 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use single staged (kStages=1) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_singlestage_wmma_[alayout]_[blayout]_[clayout]_[dtype].[threadblock_shape]_[warp_shape]
///////////////////////////////////////////////////////////////////////////////////////////////////////
/// WMMA Floating point (f16 accumulation) - Single stage - Threadblock level tests ////
///////////////////////////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f16 accumulation:
  // A row-major x B column-major -> C row-major, wmma native shape 16x16x16.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(64, 64, 32);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 128x128x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f16 accumulation:
  // 128x128x32 threadblock tile split into 2x2 warp tiles of 64x64x32.

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 64 -> two mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(128, 128, 64);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of four warps (128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Multi-CTA variant: 2x2 grid of CTAs, each computing a 128x128x32
  // threadblock tile with 2x2 warp tiles of 64x64x32 (f16 accumulation).

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(256, 256, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f16.f16 (wmma native size 32x8x16)
///////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_32x8x16) {
  // Single-stage WMMA threadblock GEMM exercising the 32x8x16 native
  // wmma shape (f16 accumulation, A row-major, B column-major, C row-major).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
//////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f16.f16 (wmma native size 8x32x16)
//////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f16, 64x64x32_64x64x32_8x32x16) {
  // Single-stage WMMA threadblock GEMM exercising the 8x32x16 native
  // wmma shape (f16 accumulation, A row-major, B column-major, C row-major).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = cutlass::half_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
/// WMMA Floating point (f32 accumulation) - Single stage - Threadblock level tests ////
///////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32 (wmma native size 16x16x16)
//////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f32 accumulation:
  // f16 operands, float accumulator, wmma native shape 16x16x16.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 128x128x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM with f32 accumulation:
  // 128x128x32 threadblock tile split into 2x2 warp tiles of 64x64x32.

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(128, 128, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of four warps (128 threads).
  dim3 grid(1, 1);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, multicta_256x256x96_128x128x32_64x64x32_16x16x16) {
  // Multi-CTA variant with f32 accumulation: 2x2 grid of CTAs, each
  // computing a 128x128x32 threadblock tile with 2x2 warp tiles of 64x64x32.

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  cutlass::gemm::GemmCoord problem_size(256, 256, 96);
  float alpha = 1.f;
  float beta = 0.0f;

  // 2x2 grid of CTAs, four warps per CTA.
  dim3 grid(2, 2);
  dim3 block(32, 4, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
///////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m32n8k16.f32.f32 (wmma native size 32x8x16)
////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_32x8x16) {
  // Single-stage WMMA threadblock GEMM exercising the 32x8x16 native
  // wmma shape with f32 accumulation.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<32, 8, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
/////////////////////////////////////////////////////////////////////////////////
/// wmma.mma.sync.aligned.alayout.blayout.shape.dtype.ctype
/// wmma.mma.sync.aligned.row.col.m8n32k16.f32.f32 (wmma native size 8x32x16)
/////////////////////////////////////////////////////////////////////////////////
TEST(SM70_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_f32, 64x64x32_64x64x32_8x32x16) {
  // Single-stage WMMA threadblock GEMM exercising the 8x32x16 native
  // wmma shape with f32 accumulation.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 32, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::half_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = float;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_WMMA_SM70_ENABLED

View File

@ -0,0 +1,331 @@
/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Unit tests for thread-level GEMM
*/
#include "cutlass/arch/wmma.h"
#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
#include "mma_pipelined_testbed.h"
#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
/// All tests use single staged (kStages=1) mma pipeline for the gemm mainloop
/// Test name format: SM[arch]_gemm_threadblock_singlestage_wmma_tensor_op_[alayout]_[blayout]_[clayout]_[atype].[threadblock_shape]_[warp_shape]_[instruction_shape]
/////////////////////////////////////////////////////////////////////////
/// Integer (s8 and u8) WMMA threadblock level tests ////
/////////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED)
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers:
  // s8 operands, s32 accumulator, A row-major x B column-major -> C row-major.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers with a deeper
  // 64-wide K tile (s8 operands, s32 accumulator, row x col -> row).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> two mainloop iterations over the 64-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_col_row_row_s8, 64x64x32_64x64x32_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers with swapped
  // operand layouts: A column-major x B row-major -> C row-major.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> four mainloop iterations over the 32-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_col_row_row_s8, 64x64x64_64x64x64_16x16x16) {
  // Single-stage WMMA threadblock GEMM on 8-bit integers with swapped
  // operand layouts and a deeper 64-wide K tile.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;

  // Operand and accumulator element types and layouts.
  using ElementA = int8_t;
  using LayoutA = cutlass::layout::ColumnMajor;
  using ElementB = int8_t;
  using LayoutB = cutlass::layout::RowMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // K = 128 -> two mainloop iterations over the 64-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.0f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
////////////////////////////////////////////////////////////////////////
/// SUBBYTE (s4 and b1) WMMA threadblock level tests ////
///////////////////////////////////////////////////////////////////////
#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_s4, 64x64x128_64x64x128_8x8x32) {
  // Single-stage WMMA threadblock GEMM on 4-bit integers:
  // s4 operands, s32 accumulator, native wmma shape 8x8x32.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // Problem K matches the 128-wide threadblock K tile: one mainloop iteration.
  cutlass::gemm::GemmCoord problem_size(64, 64, 128);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_col_s4, 64x64x64_64x64x64_8x8x32) {
  // Single-stage WMMA threadblock GEMM on 4-bit integers with a
  // column-major accumulator layout (s4 x s4 -> s32, wmma 8x8x32).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::int4b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::int4b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core assembled from the shapes and types above.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages>;

  // Problem K matches the 64-wide threadblock K tile: one mainloop iteration.
  cutlass::gemm::GemmCoord problem_size(64, 64, 64);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_row_b1, 64x64x512_64x64x512_8x8x128) {
  // Single-stage WMMA threadblock GEMM on binarized (1-bit) operands
  // using the XOR-popc operator; s32 accumulator, wmma shape 8x8x128.

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::RowMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core; the trailing OpXorPopc selects the
  // binarized multiply (XOR + population count) operator.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  // K = 2048 -> four mainloop iterations over the 512-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 2048);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
TEST(SM75_gemm_threadblock_singlestage_wmma_tensor_op_row_col_col_b1, 64x64x512_64x64x512_8x8x128) {
  // Single-stage WMMA threadblock GEMM on binarized (1-bit) operands
  // with a column-major accumulator layout (XOR-popc operator).

  // Threadblock tile equals the warp tile: a single warp covers the CTA.
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>;
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>;

  // Operand and accumulator element types and layouts.
  using ElementA = cutlass::uint1b_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementB = cutlass::uint1b_t;
  using LayoutB = cutlass::layout::ColumnMajor;
  using ElementC = int32_t;
  using LayoutC = cutlass::layout::ColumnMajor;

  // Single-stage mainloop pipeline.
  static const int kStages = 1;

  // Threadblock-level mma core; the trailing OpXorPopc selects the
  // binarized multiply (XOR + population count) operator.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
      ElementB, LayoutB, ElementC, LayoutC,
      cutlass::arch::OpClassWmmaTensorOp, kStages,
      cutlass::arch::OpXorPopc>;

  // K = 2048 -> four mainloop iterations over the 512-wide threadblock K tile.
  cutlass::gemm::GemmCoord problem_size(64, 64, 2048);
  float alpha = 1.f;
  float beta = 0.f;

  // One CTA of one warp (32 threads).
  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  test::gemm::threadblock::Testbed<MmaCore, kStages>(problem_size.m(), problem_size.n(),
                                                     problem_size.k(), alpha, beta)
      .run(grid, block);
}
#endif //CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
#endif //CUTLASS_ARCH_WMMA_SM75_ENABLED