CUTLASS 2.6 (#298)

CUTLASS 2.6
This commit is contained in:
Manish Gupta
2021-07-22 21:40:53 -07:00
committed by GitHub
parent 6c29fe20ba
commit e5d51840e8
308 changed files with 32408 additions and 4722 deletions

View File

@ -119,12 +119,12 @@ cudaError_t cutlass_hgemm_nn(
int K,
cutlass::half_t alpha,
cutlass::half_t const *A,
int lda,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
int ldb,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta,
cutlass::half_t *C,
int ldc) {
cutlass::layout::ColumnMajor::Stride::Index ldc) {
// Define the GEMM operation
using Gemm = cutlass::gemm::device::Gemm<

View File

@ -67,7 +67,7 @@ beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the computation.
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in a
bank-conflict-free manner, and a ton of other variables required to compose, initialize, and launch a
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
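As a minimal sketch of how these tile shapes are wired into the device-level GEMM (the element types, layouts, and the Sm70 architecture tag below are illustrative assumptions; the epilogue, threadblock swizzle, and stage count fall back to the library defaults):

using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;  // thread-block tile (MxNxK)
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;    // warp tile (MxNxK)
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;       // mma-op tile (MxNxK)

// The three shapes are passed straight through as template arguments; the thread
// count, shared-memory staging, and swizzling are deduced from them internally.
using Gemm = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B
    cutlass::half_t, cutlass::layout::ColumnMajor,   // C
    cutlass::half_t,                                 // accumulator
    cutlass::arch::OpClassTensorOp,                  // Tensor Core math
    cutlass::arch::Sm70,                             // assumed architecture tag
    ThreadblockShape, WarpShape, InstructionShape>;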

View File

@ -275,10 +275,10 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();

View File

@ -292,10 +292,11 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();

View File

@ -48,6 +48,10 @@ addition to its own input activation tile. Therefore the input activation warp t
2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
register file, and the operation can be fully register-file-resident.
When applying the above constraint to convolutions, it is required that the 2nd Convolution
kernel has no halos, so that the data used by each threadblock does not depend on any other
threadblock. Typically this requires that the 2nd Convolution use a 1x1 filter without any padding.
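To make this concrete, here is a hedged sketch of a second-convolution problem size that satisfies the constraint (1x1 filter, zero padding, unit stride), using the same Conv2dProblemSize constructor the example files use; the extents are illustrative only:

cutlass::conv::Conv2dProblemSize b2b_problem_size_1 (
  {128, 56, 56, 64},    // input size  (NHWC): output of the 1st convolution
  {128, 1, 1, 64},      // filter size (KRSC): 1x1 filter, so no halo
  {0, 0, 0, 0},         // padding: must be zero
  {1, 1},               // stride
  {1, 1},               // dilation
  {128, 56, 56, 128}    // output size (NPQK)
);

With these parameters every output pixel of the 2nd convolution reads exactly one input pixel, so each threadblock consumes only the accumulator tile it already holds in registers.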
# Copyright
Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_f16_sm75() {
bool run_nonfused_conv2d_fprop_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_f16_sm75() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_f16_sm75() {
bool run_fused_conv2d_fprop_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_f16_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_f16_sm75() {
bool run_fused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -359,10 +364,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_f16_sm80() {
bool run_nonfused_conv2d_fprop_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_f16_sm80() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_f16_sm80() {
bool run_fused_conv2d_fprop_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_f16_sm80() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -205,9 +206,10 @@ void run_fused_conv2d_fprop_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -240,7 +242,8 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -285,9 +288,10 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_f16_sm80() {
bool run_fused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -311,7 +315,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm80() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -355,9 +360,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_s8_sm75() {
bool run_nonfused_conv2d_fprop_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_s8_sm75() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_s8_sm75() {
bool run_fused_conv2d_fprop_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_s8_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_s8_sm75() {
bool run_fused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -359,9 +364,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_s8_sm80() {
bool run_nonfused_conv2d_fprop_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_s8_sm80() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_s8_sm80() {
bool run_fused_conv2d_fprop_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_s8_sm80() {
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_s8_sm80() {
bool run_fused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm80() {
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -359,10 +364,9 @@ void run_fused_conv2d_fprop_optimized_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_f16() {
bool run_nonfused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_f16() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@ -116,9 +115,11 @@ void run_nonfused_gemm_f16() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_f16() {
bool run_fused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -140,7 +141,8 @@ void run_fused_gemm_f16() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -183,7 +185,6 @@ void run_fused_gemm_f16() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_f16_sm80() {
bool run_nonfused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_f16_sm80() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
@ -116,9 +115,11 @@ void run_nonfused_gemm_f16_sm80() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_f16_sm80() {
bool run_fused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -140,7 +141,8 @@ void run_fused_gemm_f16_sm80() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -183,7 +185,7 @@ void run_fused_gemm_f16_sm80() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_s8() {
bool run_nonfused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_s8() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@ -116,9 +115,11 @@ void run_nonfused_gemm_s8() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_s8() {
bool run_fused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -140,7 +141,8 @@ void run_fused_gemm_s8() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -151,8 +153,6 @@ void run_fused_gemm_s8() {
ElementCompute
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
@ -183,7 +183,7 @@ void run_fused_gemm_s8() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_s8_sm80() {
bool run_nonfused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_s8_sm80() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
@ -106,7 +105,8 @@ void run_nonfused_gemm_s8_sm80() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
@ -124,9 +124,11 @@ void run_nonfused_gemm_s8_sm80() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_s8_sm80() {
bool run_fused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -148,7 +150,8 @@ void run_fused_gemm_s8_sm80() {
ElementOutput,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -156,11 +159,10 @@ void run_fused_gemm_s8_sm80() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
@ -183,8 +185,7 @@ void run_fused_gemm_s8_sm80() {
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
true
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
@ -196,7 +197,6 @@ void run_fused_gemm_s8_sm80() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -115,9 +115,7 @@ template <
/// Operation performed by GEMM
typename Operator_ = typename DefaultGemmConfiguration<
OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
ElementAccumulator_>::Operator,
/// Whether Beta is zero or not
bool IsBetaZero = false>
ElementAccumulator_>::Operator>
class B2bGemm {
public:
@ -148,7 +146,6 @@ class B2bGemm {
static int const kAlignmentB = AlignmentB;
static int const kAlignmentC = EpilogueOutputOp1::kCount;
static bool const kSplitKSerial = SplitKSerial;
static bool const kIsBetaZero = IsBetaZero;
static ComplexTransform const kTransformA = ComplexTransform::kNone;
static ComplexTransform const kTransformB = ComplexTransform::kNone;
@ -175,8 +172,7 @@ class B2bGemm {
ThreadblockSwizzle,
kStages,
kSplitKSerial,
Operator,
kIsBetaZero
Operator
>::B2bGemmKernel;
/// Argument structure
@ -422,7 +418,7 @@ public:
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace);
Status status = initialize(args, workspace, stream);
if (status == Status::kSuccess) {
status = run(stream);

View File

@ -255,7 +255,7 @@ public:
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace);
Status status = initialize(args, workspace, stream);
if (status == Status::kSuccess) {
status = run(stream);

View File

@ -28,53 +28,14 @@
#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h"
#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h"
int run() {
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
std::cout << "Running on SM80" << std::endl;
run_nonfused_conv2d_fprop_optimized_f16_sm80();
run_fused_conv2d_fprop_optimized_f16_sm80();
run_nonfused_conv2d_fprop_optimized_s8_sm80();
run_fused_conv2d_fprop_optimized_s8_sm80();
#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
std::cout << "Running on SM75" << std::endl;
run_nonfused_conv2d_fprop_optimized_f16_sm75();
run_fused_conv2d_fprop_optimized_f16_sm75();
run_nonfused_conv2d_fprop_optimized_s8_sm75();
run_fused_conv2d_fprop_optimized_s8_sm75();
#endif
return 0;
}
int main() {
int run_sm75() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
@ -85,10 +46,7 @@ int main() {
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
<< std::endl;
if (!(props.major == 7 && props.minor >= 5)) {
notSupported = true;
}
@ -96,7 +54,83 @@ int main() {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
return run();
bool pass = true;
std::cout << "Running on SM75" << std::endl;
pass &= run_nonfused_conv2d_fprop_optimized_f16_sm75();
pass &= run_fused_conv2d_fprop_optimized_f16_sm75();
pass &= run_nonfused_conv2d_fprop_optimized_s8_sm75();
pass &= run_fused_conv2d_fprop_optimized_s8_sm75();
if(pass)
return 1;
else
return -1;
}
int run_sm80() {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major == 8 && props.minor >= 0)) {
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
bool pass = true;
std::cout << "Running on SM80" << std::endl;
pass &= run_nonfused_conv2d_fprop_optimized_f16_sm80();
pass &= run_fused_conv2d_fprop_optimized_f16_sm80();
pass &= run_nonfused_conv2d_fprop_optimized_s8_sm80();
pass &= run_fused_conv2d_fprop_optimized_s8_sm80();
if(pass)
return 1;
else
return -1;
}
int main() {
int result = 0;
result = run_sm80();
if(!result) { // not supported
result = run_sm75();
if(!result) {
std::cout << "This example isn't supported on current architecture" << std::endl;
}
}
if(result >= 0)
return 0;
else
return -1;
}

View File

@ -28,36 +28,15 @@
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h"
int run() {
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
std::cout << "Running on SM80" << std::endl;
run_nonfused_gemm_f16_sm80();
run_fused_gemm_f16_sm80();
run_nonfused_gemm_s8_sm80();
run_fused_gemm_s8_sm80();
#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
std::cout << "Running on SM75" << std::endl;
run_nonfused_gemm_f16();
run_fused_gemm_f16();
run_nonfused_gemm_s8();
run_fused_gemm_s8();
#endif
return 0;
}
int main() {
int run_sm75() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
@ -68,10 +47,7 @@ int main() {
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
<< std::endl;
if (!(props.major == 7 && props.minor >= 5)) {
notSupported = true;
}
@ -80,6 +56,86 @@ int main() {
return 0;
}
return run();
bool pass = true;
std::cout << "Running on SM75" << std::endl;
pass &= run_nonfused_gemm_f16();
pass &= run_fused_gemm_f16();
pass &= run_nonfused_gemm_s8();
pass &= run_fused_gemm_s8();
if(pass)
return 1;
else
return -1;
}
int run_sm80() {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major == 8 && props.minor >= 0)) {
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
bool pass = true;
std::cout << "Running on SM80" << std::endl;
pass &= run_nonfused_gemm_f16_sm80();
pass &= run_fused_gemm_f16_sm80();
pass &= run_nonfused_gemm_s8_sm80();
pass &= run_fused_gemm_s8_sm80();
if(pass)
return 1;
else
return -1;
}
int main() {
int result = 0;
result = run_sm80();
if(!result) { // not supported
result = run_sm75();
if(!result) {
std::cout << "This example isn't supported on current architecture" << std::endl;
}
}
if(result >= 0)
return 0;
else
return -1;
}

View File

@ -66,6 +66,7 @@ struct B2bGemm {
cutlass::gemm::GemmCoord problem_size_0;
cutlass::gemm::GemmCoord problem_size_1;
cutlass::gemm::GemmCoord grid_tiled_shape;
int swizzle_log_tile;
typename B2bMma::IteratorA0::Params params_A0;
typename B2bMma::IteratorA0::TensorRef ref_A0;
typename B2bMma::IteratorB0::Params params_B0;
@ -91,7 +92,7 @@ struct B2bGemm {
//
CUTLASS_HOST_DEVICE
Params(): semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
Params(): swizzle_log_tile(0), semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
gemm_k_iterations_1(0), gemm_k_size_1(0) { }
CUTLASS_HOST_DEVICE
@ -112,6 +113,7 @@ struct B2bGemm {
problem_size_0(problem_size_0),
problem_size_1(problem_size_1),
grid_tiled_shape(grid_tiled_shape),
swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
params_A0(ref_A0.layout()),
ref_A0(ref_A0),
params_B0(ref_B0.layout()),
@ -211,7 +213,7 @@ struct B2bGemm {
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_offset =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
@ -315,7 +317,7 @@ struct B2bGemm {
//
threadblock_tile_offset =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
//assume identity swizzle
MatrixCoord threadblock_offset(

View File

@ -209,6 +209,7 @@ struct B2bImplicitGemmConvolution {
cutlass::gemm::GemmCoord grid_tiled_shape;
gemm::GemmCoord implicit_gemm_problem_size_0;
gemm::GemmCoord implicit_gemm_problem_size_1;
int swizzle_log_tile;
int gemm_k_iterations_0;
int gemm_k_iterations_1;
typename B2bMma::IteratorA0::Params iterator_A0;
@ -233,7 +234,7 @@ struct B2bImplicitGemmConvolution {
//
CUTLASS_HOST_DEVICE
Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
Params(): swizzle_log_tile(0), gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
///
CUTLASS_HOST_DEVICE
@ -245,7 +246,6 @@ struct B2bImplicitGemmConvolution {
problem_size_1(args.problem_size_1),
implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)),
implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)),
grid_tiled_shape(grid_tiled_shape),
iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())),
ptr_A0(args.ref_A0.data()),
iterator_B0(args.problem_size_0, args.ref_B0.layout()),
@ -272,6 +272,8 @@ struct B2bImplicitGemmConvolution {
implicit_gemm_problem_size_0,
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices);
swizzle_log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape);
}
};
@ -296,7 +298,7 @@ struct B2bImplicitGemmConvolution {
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
@ -379,7 +381,7 @@ struct B2bImplicitGemmConvolution {
// Compute logical position within grid
threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// If performing a reduction via split-K, fetch the initial synchronization
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {

View File

@ -111,9 +111,7 @@ template <
/// If true, kernel is configured to support serial reduction in the epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Beta is zero or not
bool IsBetaZero = false
typename Operator
>
struct DefaultB2bGemm;
@ -321,9 +319,7 @@ template <
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Is Beta zero or not
bool IsBetaZero>
typename Operator>
struct DefaultB2bGemm<
ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
@ -332,7 +328,7 @@ struct DefaultB2bGemm<
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, Stages,
SplitKSerial, Operator, IsBetaZero> {
SplitKSerial, Operator> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
@ -353,8 +349,7 @@ struct DefaultB2bGemm<
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK,
IsBetaZero>::Epilogue;
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
@ -397,9 +392,7 @@ template <
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Is Beta zero or not
bool IsBetaZero>
typename Operator>
struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
kAlignmentA, ElementB,
layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
@ -407,7 +400,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
int32_t, arch::OpClassTensorOp, arch::Sm75,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> {
ThreadblockSwizzle, 2, SplitKSerial, Operator> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
@ -426,8 +419,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK,
IsBetaZero>::Epilogue;
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;

View File

@ -43,14 +43,122 @@ fp32 data by using NVIDIA Ampere architecture.
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
//
// Methods
//
Result(
double runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess
):
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::gemm::GemmCoord problem_size;
int batch_count;
float alpha;
float beta;
bool reference_check;
int iterations;
Options():
help(false),
problem_size({5120, 4096, 4096}),
batch_count(1),
reference_check(true),
iterations(20),
alpha(1),
beta() { }
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "14_ampere_tf32_tensorop_gemm example\n\n"
<< " This example uses the CUTLASS Library to execute TF32 tensorop GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m <int> GEMM M dimension\n"
<< " --n <int> GEMM N dimension\n"
<< " --k <int> GEMM K dimension\n"
<< " --alpha <f32> Epilogue scalar alpha\n"
<< " --beta <f32> Epilogue scalar beta\n\n"
<< " --iterations <int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of real-valued multiply-adds
int64_t fmas = problem_size.product() * batch_count;
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
@ -111,14 +219,10 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
SwizzleThreadBlock,
NumStages>;
int run() {
const int length_m = 5120;
const int length_n = 4096;
const int length_k = 4096;
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
cutlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
@ -166,8 +270,8 @@ int run() {
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
// Split K dimension into 1 partitions
int split_k_slices = 1;
@ -199,9 +303,74 @@ int run() {
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
// Result structure
Result result;
//
// Construct events
//
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
// Create instantiation for device reference gemm kernel
cutlass::reference::device::Gemm<ElementInputA,
@ -235,12 +404,17 @@ int run() {
tensor_d.host_view(),
tensor_ref_d.host_view());
if (passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << " GFLOPs: " << result.gflops << std::endl;
}
std::cout << (passed ? "Passed" : "Failed") << std::endl;
return (passed ? 0 : -1);
}
int main() {
int main(int argc, const char **argv) {
bool notSupported = false;
@ -272,5 +446,21 @@ int main() {
return 0;
}
return run();
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d TF32 tensor op Matrix Multiply\n", \
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}

View File

@ -152,7 +152,7 @@ int run() {
cutlass::HostTensor<ElementInputE, LayoutInputE> tensor_e(
cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE));
// Same size as the above. The above one needs to be reordered and stored in this one.
cutlass::HostTensor<ElementInputE, ReorderedLayoutInputE> tensor_e_reordered(
cutlass::HostTensor<ElementInputE, ReorderedLayoutInputE> tensor_e_reordered(
cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE));
// Fill input and output matrices on host using CUTLASS helper functions

View File

@ -158,7 +158,7 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw
constexpr int NumStages = 3;
// This code section describe iterator algorithm selected is Analytic or Optimized
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kAnalytic;
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
@ -189,7 +189,6 @@ using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
@ -755,6 +754,3 @@ int main(int argc, char const **args) {
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,28 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
18_ampere_fp64_tensorop_affine2_gemm
ampere_fp64_tensorop_affine2_gemm.cu
)

View File

@ -0,0 +1,336 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
In a normal GEMM, the fast-changing dimension of a matrix always has a stride
equal to 1, e.g. a ColumnMajor or RowMajor matrix. An Affine2 matrix can have
a stride larger than 1 in both dimensions. To support such a layout, we need to
change the method used to visit global memory:
1. We can only visit 1 element at a time because elements are no longer stored
consecutively. Vectorized load/store is not possible.
2. One extra multiplication is needed when calculating the global memory
address
addr = base_pointer + coord1 * stride1 + coord2 * stride2
The rest of the GEMM, which includes the shared memory load/store and mma computation,
is the same.
This example uses an Ampere fp64 tensor core Affine2 GEMM. SIMT kernels
(e.g. sgemm, dgemm) also support the Affine2 layout.
*/
#include <iostream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/kernel/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = double; // Data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation
using ElementInputA = double; // Data type of elements in input tensor
using ElementInputB = double; // Data type of elements in input tensor
using ElementOutput = double; // Data type of elements in output tensor
// Since Affine2 explicitly lists the strides of both dimensions, it does not really matter whether
// it is column major or row major. However, it helps CUTLASS improve load locality if
// CUTLASS knows which dimension of the A/B operand has the smaller stride, i.e. is more dense.
//
// Affine2 ColumnMajor means the row stride is smaller, and Affine2 RowMajor means the column
// stride is smaller.
//
// The Affine2 epilogue reuses the AffineN epilogue, so it does not need to specify column major
// or row major.
using LayoutInputA = cutlass::layout::AffineRank2ColumnMajor;
using LayoutInputB = cutlass::layout::AffineRank2RowMajor;
using LayoutOutput = cutlass::layout::AffineRankN<2>;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm80;
// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 16>; // Threadblock tile shape
// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<64, 32, 16>; // Warp tile shape
// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; // TensorCore instruction shape
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>;
// Number of pipelines you want to use
constexpr int NumStages = 3;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // Data type of output matrix.
1, // The number of elements per memory
// access. It has to be 1 for
// affine2.
ElementComputeEpilogue>;
using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmUniversal<
ElementInputA, LayoutInputA, cutlass::ComplexTransform::kNone, 1, // AlignmentA has to be 1
ElementInputB, LayoutInputB, cutlass::ComplexTransform::kNone, 1, // AlignmentB has to be 1
ElementOutput, LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ThreadblockShape,
WarpShape,
InstructionShape,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
cutlass::arch::OpMultiplyAdd
>::GemmKernel;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////
int run() {
// Construct Gemm ProblemSize with user defined output size
cutlass::gemm::GemmCoord problem_size = {1024, 512, 1024};
// The stride factor gives the distance between two elements in the different dimensions. The
// first value is the logical distance between two rows, the second between two columns.
// CUTLASS provides a utility, cutlass::layout::Affine2Layout_Factory<Layout>::layout_factory,
// to help convert the stride_factor into the two strides.
//
// It is also totally fine to compute the strides directly without using the utility to
// construct the affine2 layout.
typename LayoutInputA::Stride::Index stride_factor_A[] = {3, 4};
typename LayoutInputB::Stride::Index stride_factor_B[] = {5, 6};
typename LayoutOutput::Stride::Index stride_factor_C[] = {7, 8};
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.mk(),
cutlass::layout::Affine2Layout_Factory<LayoutInputA>::layout_factory(problem_size.mk(),
stride_factor_A));
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.kn(),
cutlass::layout::Affine2Layout_Factory<LayoutInputB>::layout_factory(problem_size.kn(),
stride_factor_B));
// Create matrix C used to load for bias addition.
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(problem_size.mn(),
cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
stride_factor_C));
// Create matrix D used to store output from CUTLASS kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.mn(),
cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
stride_factor_C));
// Create matrix D with dimensions M x N used to store output from reference
// kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(problem_size.mn(),
cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
stride_factor_C));
// Fill input and output matrices on host using CUTLASS helper functions
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
cutlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(1);
cutlass::gemm::GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
int batch_count = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated CUTLASS kernel
typename Gemm::Arguments arguments{
mode,
problem_size,
batch_count,
{alpha, beta},
tensor_a.device_ref().data(), // <- reference to matrix A on device
tensor_b.device_ref().data(), // <- reference to matrix B on device
tensor_c.device_ref().data(), // <- reference to matrix C on device
tensor_d.device_ref().data(), // <- reference to matrix D on device
tensor_a.layout().capacity(problem_size.mk()),
tensor_b.layout().capacity(problem_size.kn()),
tensor_c.layout().capacity(problem_size.mn()),
tensor_d.layout().capacity(problem_size.mn()),
tensor_a.layout().stride(),
tensor_b.layout().stride(),
tensor_c.layout().stride(),
tensor_d.layout().stride()
};
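// Descriptive note: in this universal GEMM argument structure, the four capacity() values act as
// the per-batch strides of A/B/C/D (only meaningful when batch_count > 1), and the last four
// entries are the affine layout strides of each operand.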
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
//
// Create instantiation for device reference gemm kernel
//
// Launch device reference GEMM to compute alpha * (A * B) + beta * C
cutlass::reference::device::Gemm<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementAccumulator> gemm_device;
gemm_device
(
problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref()
);
// Wait for kernels to finish
cudaDeviceSynchronize();
// Copy output data from CUTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
bool pass = cutlass::reference::host::TensorEquals(tensor_d.host_view(),
tensor_ref_d.host_view());
// Check if output from CUTLASS kernel and reference kernel are equal or not
std::cout << (pass
? "Passed"
: "Failed")
<< std::endl;
CUTLASS_CHECK(status);
return 0;
}
int main(int argc, char const **args) {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run this example.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
<< std::endl;
notSupported = true;
}
if (notSupported) {
return 0;
}
return run();
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
19_tensorop_canonical
tensorop_canonical.cu
)

View File

@ -0,0 +1,432 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*
This example requires NVIDIA Ampere GPU or later.
*/
// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>
// CUTLASS Includes
#include "cutlass/cutlass.h"
#include "cutlass/functional.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/warp/default_mma_tensor_op.h"
#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
// CUTLASS Utility Includes
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gemm_complex.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define the overall warp-level problem shape
int const kM = 27;
int const kN = 31;
int const kK = 17;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define a warp-level GEMM operator.
//
// This template could be part of the CUTLASS Template Library or implemented internally. This
// wraps the matrix multiply operation and epilogue with a GEMM-like interface that can be
// instantiated in device code.
namespace cutlass {
namespace gemm {
namespace warp {
template <
typename Shape,
typename InstructionShape,
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementScalar
>
class GemmTensorOp {
public:
using WarpShape = GemmShape<
((Shape::kM + InstructionShape::kM - 1) / InstructionShape::kM) * InstructionShape::kM,
((Shape::kN + InstructionShape::kN - 1) / InstructionShape::kN) * InstructionShape::kN,
InstructionShape::kK
>;
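// For the instantiation used in this example (overall shape 27x31x17, instruction shape 8x8x4),
// this rounds the warp tile up to 32x32x4.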
using MmaWarp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
WarpShape,
InstructionShape,
double, // Data type of A elements
cutlass::layout::RowMajor, // Layout of A matrix
double, // Data type of B elements
cutlass::layout::ColumnMajor, // Layout of B matrix
double, // Data type of C elements
cutlass::layout::RowMajor // Layout of C matrix
>::Type;
// Number of 'K groups'
int const kKgroups = (Shape::kK + InstructionShape::kK - 1) / InstructionShape::kK;
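// For this example's instantiation: (17 + 4 - 1) / 4 = 5 K groups.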
// Define a 'FragmentIterator' to iterate over slices of accumulators
using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
typename MmaWarp::Shape,
InstructionShape,
double,
typename MmaWarp::Policy::Operator::FragmentC,
cutlass::layout::RowMajor
>;
// Define an epilogue 'Tile Iterator' to iterate over slices of elements in Shared Memory
using AccumulatorTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpCanonical<
typename MmaWarp::Shape,
InstructionShape,
double,
cutlass::layout::RowMajor
>;
using TensorRefA = typename MmaWarp::IteratorA::TensorRef;
using TensorRefB = typename MmaWarp::IteratorB::TensorRef;
using TensorRefC = typename AccumulatorTileIterator::TensorRef;
public:
CUTLASS_HOST_DEVICE
GemmTensorOp() { }
CUTLASS_DEVICE
void operator()(
ElementScalar alpha,
TensorRefA ref_A,
TensorRefB ref_B,
ElementScalar beta,
TensorRefC ref_C,
TensorRefC ref_D,
int lane_id) const {
// Instantiate iterators pointing to slices of the A and B matrices in shared memory
typename MmaWarp::IteratorA iter_A(ref_A, {Shape::kM, Shape::kK}, lane_id);
typename MmaWarp::IteratorB iter_B(ref_B, {Shape::kK, Shape::kN}, lane_id);
// Instantiate and clear accumulator tile holding the C matrix
typename MmaWarp::FragmentC accum;
accum.clear();
// Instantiate the warp-level matrix multiply operator
MmaWarp mma_op;
// Instantiate fragments holding the slice of the matrix held by each warp
typename MmaWarp::FragmentA frag_A[2];
typename MmaWarp::FragmentB frag_B[2];
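// Two fragments per operand are kept so the load of the next K group can overlap the matrix
// multiply on the current one in the mainloop below (simple double buffering).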
// Load fragments from shared memory
iter_A.load(frag_A[0]);
iter_B.load(frag_B[0]);
++iter_A;
++iter_B;
// Mainloop: iterate over the K groups
CUTLASS_PRAGMA_UNROLL
for (int k = 0; k < kKgroups; ++k) {
// Load fragments from shared memory
iter_A.load(frag_A[(k + 1) % 2]);
iter_B.load(frag_B[(k + 1) % 2]);
++iter_A;
++iter_B;
// Compute the matrix multiply
mma_op(accum, frag_A[k % 2], frag_B[k % 2], accum);
}
// Instantiate iterators
FragmentIterator accum_frag_it(accum);
AccumulatorTileIterator source_tile_it(ref_C, {Shape::kM, Shape::kN}, lane_id);
AccumulatorTileIterator dest_tile_it(ref_D, {Shape::kM, Shape::kN}, lane_id);
// Define function objects for linear scaling operation
cutlass::multiplies<typename FragmentIterator::Fragment> mul_source;
cutlass::multiply_add<typename FragmentIterator::Fragment> mul_add_accumulator;
// Iterate over the epilogue components
CUTLASS_PRAGMA_UNROLL
for (int idx = 0; idx < FragmentIterator::kIterations; ++idx) {
// Define storage for slices of the accumulators
typename FragmentIterator::Fragment accum_fragment;
typename FragmentIterator::Fragment source_fragment;
// Select a slice of accumulators from the accumulator tile
accum_frag_it.load(accum_fragment);
++accum_frag_it;
// Load a corresponding slice from Shared memory
source_tile_it.load(source_fragment);
++source_tile_it;
// Compute linear scaling - alpha * AB + beta * C
source_fragment = mul_source(beta, source_fragment);
accum_fragment = mul_add_accumulator(alpha, accum_fragment, source_fragment);
// Store the result to shared memory
dest_tile_it.store(accum_fragment);
++dest_tile_it;
}
}
};
} // namespace warp
} // namespace gemm
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
// Sample kernel demonstrating a collective GEMM operation by a warp on arbitrary matrices held
// in Shared Memory.
__global__ void kernel(
double *D_gmem,
double alpha,
double const *A_gmem,
double const *B_gmem,
double beta,
double const *C_gmem) {
// Define several matrices in shared memory
__shared__ double A[kM][kK];
__shared__ double B[kN][kK];
__shared__ double C[kM][kN];
// Copy data into SMEM
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
for (int k = 0; k < kK; ++k) {
A[m][k] = A_gmem[m * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
for (int k = 0; k < kK; ++k) {
B[n][k] = B_gmem[n * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
C[m][n] = C_gmem[m * kN + n];
}
}
}
__syncthreads();
//
// Instantiate a warp-level matrix multiply operator given the fundamental instruction shape (8x8x4),
// overall shape, data type of each operand, and layout of each operand.
//
using GemmTensorOp = cutlass::gemm::warp::GemmTensorOp<
cutlass::gemm::GemmShape<kM, kN, kK>,
cutlass::gemm::GemmShape<8, 8, 4>,
double, // Data type of A elements
cutlass::layout::RowMajor, // Layout of A matrix
double, // Data type of B elements
cutlass::layout::ColumnMajor, // Layout of B matrix
double, // Data type of C elements
cutlass::layout::RowMajor, // Layout of C matrix
double // Scalar type of alpha and beta
>;
// Instantiate the GEMM operator
GemmTensorOp gemm;
// Execute the warp-level GEMM operation
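// Each {pointer, stride} pair below forms a TensorRef: A is row-major with leading dimension kK,
// B is column-major (stored here as B[n][k]) with leading dimension kK, and C/D are row-major
// with leading dimension kN.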
gemm(
alpha,
{&A[0][0], kK},
{&B[0][0], kK},
beta,
{&C[0][0], kN},
{&C[0][0], kN},
threadIdx.x);
__syncthreads();
// Copy results from SMEM back to global memory
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
D_gmem[m * kN + n] = C[m][n];
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Entry point to canonical warp-level GEMM operation
int main(int argc, const char *arg[]) {
bool notSupported = false;
// CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ >= 11)) {
std::cerr << "NVIDIA Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!((props.major * 10 + props.minor) >= 80)) {
std::cerr << "This example requires compute capability at least 80."
<< std::endl;
notSupported = true;
}
if (notSupported) {
// Return 0 so tests are considered passing if run on unsupported platforms.
return 0;
}
cutlass::HostTensor<double, cutlass::layout::RowMajor> A({kM, kK});
cutlass::HostTensor<double, cutlass::layout::ColumnMajor> B({kK, kN});
cutlass::HostTensor<double, cutlass::layout::RowMajor> C({kM, kN});
cutlass::HostTensor<double, cutlass::layout::RowMajor> D({kM, kN});
uint64_t seed = 2020;
double max = 8;
double min = -8;
cutlass::reference::host::TensorFillRandomUniform(
A.host_view(),
seed,
max,
min,
0
);
cutlass::reference::host::TensorFillRandomUniform(
B.host_view(),
seed + 17,
max,
min,
0
);
cutlass::reference::host::TensorFillRandomUniform(
C.host_view(),
seed + 31,
max,
min,
0
);
A.sync_device();
B.sync_device();
C.sync_device();
D.sync_device();
dim3 grid(1,1);
dim3 block(32, 1, 1);
double alpha = 2.25;
double beta = 1.24;
kernel<<< grid, block >>>(
D.device_data(),
alpha,
A.device_data(),
B.device_data(),
beta,
C.device_data()
);
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Failed to synchronize device after kernel launch." << std::endl;
return -1;
}
D.sync_host();
// Compute reference on host
cutlass::HostTensor<double, cutlass::layout::RowMajor> D_ref({kM, kN}, false);
cutlass::reference::host::GemmComplex(
{kM, kN, kK},
alpha,
A.host_ref(),
cutlass::ComplexTransform::kNone,
B.host_ref(),
cutlass::ComplexTransform::kNone,
beta,
C.host_ref(),
D_ref.host_ref(),
double()
);
// Verify reference matches computed
if (!cutlass::reference::host::TensorEquals(
D.host_view(),
D_ref.host_view())) {
std::cerr
<< "A =\n" << A.host_view()
<< "\n\nB = \n" << B.host_view()
<< "\n\nC = " << C.host_view()
<< "\n\nRef =\n" << D_ref.host_view()
<< "\n\nD =\n" << D.host_view() << "\n\n";
std::cerr << "Error - device results mismatch host reference." << std::endl;
return -1;
}
std::cout << "Passed" << std::endl;
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
20_simt_canonical
simt_canonical.cu
)

View File

@ -0,0 +1,419 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*
This example requires NVIDIA Maxwell GPU or beyond.
*/
// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>
// CUTLASS Includes
#include "cutlass/cutlass.h"
#include "cutlass/core_io.h"
#include "cutlass/functional.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/warp/mma_simt.h"
#include "cutlass/epilogue/warp/fragment_iterator_simt.h"
#include "cutlass/epilogue/warp/tile_iterator_simt.h"
// CUTLASS Utility Includes
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/gemm_complex.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define the overall warp-level problem shape
int const kM = 14;
int const kN = 27;
int const kK = 17;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define a warp-level GEMM operator.
//
// This template could be part of the CUTLASS Template Library or implemented internally. This
// wraps the matrix multiply operation and epilogue with a GEMM-like interface that can be
// instantiated in device code.
namespace cutlass {
namespace gemm {
namespace warp {
template <
typename Shape,
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementScalar
>
class GemmSimt {
public:
using Policy = cutlass::gemm::warp::MmaSimtPolicy<
cutlass::MatrixShape<4, 8>,
cutlass::layout::RowMajorInterleaved<2>,
cutlass::gemm::GemmShape<4, 4, 1>
>;
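// Descriptive note: MatrixShape<4, 8> arranges the warp's 32 lanes as a 4x8 grid with a
// row-major, interleaved-by-2 lane ordering, and GemmShape<4, 4, 1> is the tile each lane
// computes per step; together they cover the 16x32 warp tile used below (4*4 rows x 8*4 columns).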
using MmaWarp = cutlass::gemm::warp::MmaSimt<
cutlass::gemm::GemmShape<16, 32, 8>,
float,
cutlass::layout::RowMajor,
float,
cutlass::layout::ColumnMajor,
float,
cutlass::layout::RowMajor,
Policy
>;
// Number of 'K groups'
int const kKgroups = Shape::kK;
using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
typename MmaWarp::Shape,
typename MmaWarp::ThreadMma,
layout::RowMajor, // SMEM layout
typename MmaWarp::Policy
>;
using AccumulatorTileIterator = cutlass::epilogue::warp::TileIteratorSimtCanonical<
typename MmaWarp::Shape,
typename MmaWarp::ThreadMma,
float, // ElementAccumulator
layout::RowMajor, // SMEM layout
typename MmaWarp::Policy
>;
using TensorRefA = typename MmaWarp::IteratorA::TensorRef;
using TensorRefB = typename MmaWarp::IteratorB::TensorRef;
using TensorRefC = typename AccumulatorTileIterator::TensorRef;
public:
CUTLASS_HOST_DEVICE
GemmSimt() { }
CUTLASS_DEVICE
void operator()(
ElementScalar alpha,
TensorRefA ref_A,
TensorRefB ref_B,
ElementScalar beta,
TensorRefC ref_C,
TensorRefC ref_D,
int lane_id) const {
// Instantiate iterators pointing to slices of the A and B matrices in shared memory
typename MmaWarp::IteratorA iter_A(ref_A, {Shape::kM, Shape::kK}, lane_id);
typename MmaWarp::IteratorB iter_B(ref_B, {Shape::kK, Shape::kN}, lane_id);
// Instantiate and clear accumulator tile holding the C matrix
typename MmaWarp::FragmentC accum;
accum.clear();
// Instantiate the warp-level matrix multiply operator
MmaWarp mma_op;
// Instantiate fragments holding the slice of the matrix held by each warp
typename MmaWarp::FragmentA frag_A[2];
typename MmaWarp::FragmentB frag_B[2];
// Load fragments from shared memory
iter_A.load(frag_A[0]);
iter_B.load(frag_B[0]);
++iter_A;
++iter_B;
// Mainloop: iterate over the K groups
CUTLASS_PRAGMA_UNROLL
for (int k = 0; k < kKgroups; ++k) {
// Load fragments from shared memory
iter_A.load(frag_A[(k + 1) % 2]);
iter_B.load(frag_B[(k + 1) % 2]);
++iter_A;
++iter_B;
// Compute the matrix multiply
mma_op(accum, frag_A[k % 2], frag_B[k % 2], accum);
}
// Instantiate iterators
FragmentIterator accum_frag_it(accum);
AccumulatorTileIterator source_tile_it(ref_C, {Shape::kM, Shape::kN}, lane_id);
AccumulatorTileIterator dest_tile_it(ref_D, {Shape::kM, Shape::kN}, lane_id);
// Define function objects for linear scaling operation
cutlass::multiplies<typename FragmentIterator::Fragment> mul_source;
cutlass::multiply_add<typename FragmentIterator::Fragment> mul_add_accumulator;
// Iterate over the epilogue components
CUTLASS_PRAGMA_UNROLL
for (int idx = 0; idx < FragmentIterator::kIterations; ++idx) {
// Define storage for slices of the accumulators
typename FragmentIterator::Fragment accum_fragment;
typename FragmentIterator::Fragment source_fragment;
// Select a slice of accumulators from the accumulator tile
accum_frag_it.load(accum_fragment);
++accum_frag_it;
// Load a corresponding slice from Shared memory
source_tile_it.load(source_fragment);
++source_tile_it;
// Compute linear scaling - alpha * AB + beta * C
source_fragment = mul_source(beta, source_fragment);
accum_fragment = mul_add_accumulator(alpha, accum_fragment, source_fragment);
// Store the result to shared memory
dest_tile_it.store(accum_fragment);
++dest_tile_it;
}
}
};
} // namespace warp
} // namespace gemm
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
// Sample kernel demonstrating a collective GEMM operation by a warp on arbitrary matrices held
// in Shared Memory.
__global__ void kernel(
float *D_gmem,
float alpha,
float const *A_gmem,
float const *B_gmem,
float beta,
float const *C_gmem) {
// Define several matrices in shared memory
__shared__ float A[kM][kK];
__shared__ float B[kN][kK];
__shared__ float C[kM][kN];
// Copy data into SMEM
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
for (int k = 0; k < kK; ++k) {
A[m][k] = A_gmem[m * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
for (int k = 0; k < kK; ++k) {
B[n][k] = B_gmem[n * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
C[m][n] = C_gmem[m * kN + n];
}
}
}
__syncthreads();
//
// Instantiate a warp-level matrix multiply operator given the overall problem shape,
// data type of each operand, and layout of each operand.
//
using GemmSimt = cutlass::gemm::warp::GemmSimt<
cutlass::gemm::GemmShape<kM, kN, kK>,
float, // Data type of A elements
cutlass::layout::RowMajor, // Layout of A matrix
float, // Data type of B elements
cutlass::layout::ColumnMajor, // Layout of B matrix
float, // Data type of C elements
cutlass::layout::RowMajor, // Layout of C matrix
float // Scalar type of alpha and beta
>;
// Instantiate the GEMM operator
GemmSimt gemm;
// Execute the warp-level GEMM operation
gemm(
alpha,
{&A[0][0], kK},
{&B[0][0], kK},
beta,
{&C[0][0], kN},
{&C[0][0], kN},
threadIdx.x);
__syncthreads();
// Copy results from SMEM back to global memory
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
D_gmem[m * kN + n] = C[m][n];
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, const char *arg[]) {
cutlass::HostTensor<float, cutlass::layout::RowMajor> A({kM, kK});
cutlass::HostTensor<float, cutlass::layout::ColumnMajor> B({kK, kN});
cutlass::HostTensor<float, cutlass::layout::RowMajor> C({kM, kN});
cutlass::HostTensor<float, cutlass::layout::RowMajor> D({kM, kN});
uint64_t seed = 2020;
float max = 8;
float min = -8;
std::cout << "Simt canonical GEMM problem size = (" << cutlass::gemm::GemmShape<kM, kN, kK>() <<")" << std::endl;
cutlass::reference::host::TensorFillRandomUniform(
A.host_view(),
seed,
max,
min,
0
);
cutlass::reference::host::TensorFillRandomUniform(
B.host_view(),
seed + 17,
max,
min,
0
);
#if 0 // Debug: fill A sequentially and B as Identity matrix for debugging
cutlass::reference::host::BlockFillSequential(
A.host_view().data(), A.host_view().capacity());
cutlass::reference::host::TensorFillIdentity(B.host_view());
#endif
cutlass::reference::host::TensorFillRandomUniform(
C.host_view(),
seed + 31,
max,
min,
0
);
A.sync_device();
B.sync_device();
C.sync_device();
D.sync_device();
dim3 grid(1, 1);
dim3 block(32, 1, 1);
float alpha = 1.0f;
float beta = 0.0f;
kernel<<< grid, block >>>(
D.device_data(),
alpha,
A.device_data(),
B.device_data(),
beta,
C.device_data()
);
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Failed to synchronize device after kernel launch." << std::endl;
return -1;
}
D.sync_host();
// Compute reference on host
cutlass::HostTensor<float, cutlass::layout::RowMajor> D_ref({kM, kN}, false);
cutlass::reference::host::TensorCopy(D_ref.host_view(), C.host_view());
cutlass::reference::host::Gemm<
float, cutlass::layout::RowMajor,
float, cutlass::layout::ColumnMajor,
float, cutlass::layout::RowMajor,
float, float> reference_gemm;
reference_gemm(
{kM, kN, kK},
alpha,
A.host_ref(),
B.host_ref(),
beta,
D_ref.host_ref(),
float()
);
// Verify reference matches computed
if (!cutlass::reference::host::TensorEquals(
D.host_view(),
D_ref.host_view())) {
std::cerr
<< "A =\n" << A.host_view()
<< "\n\nB = \n" << B.host_view()
<< "\n\nC = " << C.host_view()
<< "\n\nRef =\n" << D_ref.host_view()
<< "\n\nD =\n" << D.host_view() << "\n\n";
std::cerr << "Error - device results mismatch host reference." << std::endl;
return -1;
}
std::cout << "Passed" << std::endl;
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
21_quaternion_gemm
quaternion_gemm.cu
)

View File

@ -0,0 +1,448 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
//
// Methods
//
Result(
double runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess
):
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::gemm::GemmCoord problem_size;
int batch_count;
cutlass::Quaternion<float> alpha;
cutlass::Quaternion<float> beta;
bool reference_check;
int iterations;
Options():
help(false),
problem_size({1024, 1024, 1024}),
batch_count(1),
alpha(1),
beta(),
reference_check(true),
iterations(20) { }
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("batch", batch_count);
cmd.get_cmd_line_argument("alpha", alpha.w());
cmd.get_cmd_line_argument("alpha_i", alpha.x());
cmd.get_cmd_line_argument("alpha_j", alpha.y());
cmd.get_cmd_line_argument("alpha_k", alpha.z());
cmd.get_cmd_line_argument("beta", beta.w());
cmd.get_cmd_line_argument("beta_i", beta.x());
cmd.get_cmd_line_argument("beta_j", beta.y());
cmd.get_cmd_line_argument("beta_k", beta.z());
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "21_quaternion_gemm example\n\n"
<< " This example uses the CUTLASS Library to execute Quaternion GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m <int> GEMM M dimension\n"
<< " --n <int> GEMM N dimension\n"
<< " --k <int> GEMM K dimension\n"
<< " --batch <int> Number of GEMM operations executed in one batch\n"
<< " --alpha <f32> Epilogue scalar alpha (real part)\n"
<< " --alpha_i <f32> Epilogue scalar alpha_i (imaginary part)\n"
<< " --alpha_j <f32> Epilogue scalar alpha_j (imaginary part)\n"
<< " --alpha_k <f32> Epilogue scalar alpha_k (imaginary part)\n"
<< " --beta <f32> Epilogue scalar beta (real part)\n\n"
<< " --beta_i <f32> Epilogue scalar beta_i (imaginary part)\n\n"
<< " --beta_j <f32> Epilogue scalar beta_j (imaginary part)\n\n"
<< " --beta_k <f32> Epilogue scalar beta_k (imaginary part)\n\n"
<< " --iterations <int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/21_quaternion_gemm/21_quaternion_gemm --batch=7 --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of real-valued multiply-adds
int64_t fmas = problem_size.product() * batch_count * 16;
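// (A single quaternion multiply-add expands to 16 real multiply-adds, hence the factor of 16.)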
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using precision = float;
using Element = cutlass::Quaternion<float>;
using ElementComputeEpilogue = Element; // <- data type of epilogue operations
using ElementAccumulator = Element; // <- data type of accumulator
using ElementInputA = Element; // <- data type of elements in input matrix A
using ElementInputB = Element; // <- data type of elements in input matrix B
using ElementOutput = Element; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Row Major for
// Matrix A, Column Major for Matrix B and Row Major for Matrix C
using LayoutInputA = cutlass::layout::RowMajor;
using LayoutInputB = cutlass::layout::ColumnMajor;
using LayoutOutput = cutlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassSimt;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm50;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
cutlass::gemm::GemmShape<64, 64, 4>; // <- threadblock tile M = 64, N = 64, K = 4
// This code section describes tile size a warp will compute
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 16, 4>; // <- warp tile M = 32, N = 16, K = 4
// This code section describes the size of MMA op
using ShapeMMAOp = cutlass::gemm::GemmShape<1, 1, 1>; // <- MMA Op tile M = 1, N = 1, K = 1
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- Defaults
// This code section describes the epilogue part of the kernel
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
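// Note: for Quaternion<float> (4 x 32 bits = 128 bits per element), the expression above
// evaluates to one element per vectorized memory access.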
// Number of pipelines you want to use
constexpr int NumStages = 2;
using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages>;
int run(Options options) {
// PASS/FAIL status
bool passed = true;
// Create a tuple of problem size for matrix multiplication
cutlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// CUTLASS kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using CUTLASS helper functions
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
4,
-4,
0); // <- Fill matrix A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
4,
-4,
0); // <- Fill matrix B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
4,
-4,
0); // <- Fill matrix C on host with uniform-distribution random data
cutlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split the K dimension into a single partition (no split-K)
int split_k_slices = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated CUTLASS kernel
typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Result structure
Result result;
//
// Construct events
//
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
if (options.reference_check) {
// Create instantiation for device reference gemm kernel
cutlass::reference::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue> gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
cudaDeviceSynchronize();
// Copy output data from CUTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from CUTLASS kernel and reference kernel are equal or not
passed &= cutlass::reference::host::TensorEquals(
tensor_d.host_view(),
tensor_ref_d.host_view());
}
if (passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << " GFLOPs: " << result.gflops << std::endl;
}
std::cout << (passed ? "Passed" : "Failed") << std::endl;
return (passed ? 0 : -1);
}
int main(int argc, char const** argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d Single Precision Quaternion Matrix Multiply\n", \
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}

View File

@ -0,0 +1,28 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
22_quaternion_conv
quaternion_conv.cu
)

View File

@ -0,0 +1,660 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using Element = cutlass::Quaternion<float>;
using ElementAccumulator = Element; // Data type of accumulator
using ElementComputeEpilogue = Element; // Data type of epilogue computation (alpha, beta)
using ElementInputA = Element; // Data type of elements in input tensor
using ElementInputB = Element; // Data type of elements in input tensor
using ElementOutput = Element; // Data type of elements in output tensor
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassSimt;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm50;
// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // Threadblock tile shape
// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // Warp tile shape
// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; // SIMT instruction shape
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
// Number of pipelines you want to use
constexpr int NumStages = 2;
// This code section describes whether the selected iterator algorithm is Analytic or Optimized
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // Data type of output matrix.
128 / cutlass::sizeof_bits<ElementOutput>::value, // The number of elements per vectorized.
// memory access. This becomes the vector width of
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementComputeEpilogue>; // Data type for alpha/beta in linear combination
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementInputA, LayoutInputA,
ElementInputB, LayoutInputB,
ElementOutput, LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ThreadblockShape,
WarpShape,
InstructionShape,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
cutlass::arch::OpMultiplyAdd,
IteratorAlgorithm
>::Kernel;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
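// Descriptive note: under the implicit GEMM formulation, this Fprop convolution is computed as a
// GEMM of extent (N*P*Q) x K x (C*R*S); with the default problem size below (1x32x32x32 input,
// 32x3x3x32 filter, padding 1, stride 1) that is a 1024 x 32 x 288 quaternion GEMM.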
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::Tensor4DCoord input_size;
cutlass::Tensor4DCoord filter_size;
cutlass::Tensor4DCoord padding;
cutlass::MatrixCoord conv_stride;
cutlass::MatrixCoord dilation;
bool reference_check;
bool measure_performance;
int iterations;
bool save_workspace;
ElementComputeEpilogue alpha;
ElementComputeEpilogue beta;
bool benchmark;
std::string tag;
Options():
help(false),
input_size(1, 32, 32, 32),
filter_size(32, 3, 3, 32),
padding(1, 1, 1, 1),
conv_stride(1, 1),
dilation(1, 1),
reference_check(false),
measure_performance(true),
iterations(20),
save_workspace(false),
alpha(1),
beta(0),
benchmark(false) { }
// Verify the problem size is compatible with the CUTLASS Convolution implementation.
bool valid() {
//
// CUTLASS attempts to use vectorized memory accesses. Consequently, all pointers, strides, and
// tensor extents are required to be divisible by kAlignment elements.
//
int const kAlignment = 8;
if ((input_size.c() % kAlignment) ||
(filter_size.n() % kAlignment)) {
// misaligned tensors
return false;
}
// Invalid padding
if ((padding.h() != filter_size.h() / 2) ||
(padding.w() != filter_size.w() / 2)) {
return false;
}
return true;
}
/// Updates input and filter sizes
void update(
cutlass::Tensor4DCoord input_size,
cutlass::Tensor4DCoord filter_size) {
this->input_size = input_size;
this->filter_size = filter_size;
padding.n() = filter_size.h() / 2;
padding.h() = filter_size.h() / 2;
padding.w() = filter_size.w() / 2;
padding.c() = filter_size.w() / 2;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
if (cmd.check_cmd_line_flag("ref-check")) {
reference_check = true;
}
if (cmd.check_cmd_line_flag("perf-check")) {
measure_performance = true;
}
if (cmd.check_cmd_line_flag("save-workspace")) {
save_workspace = true;
}
if (cmd.check_cmd_line_flag("benchmark")) {
benchmark = true;
}
cmd.get_cmd_line_argument("n", input_size.n());
cmd.get_cmd_line_argument("h", input_size.h());
cmd.get_cmd_line_argument("w", input_size.w());
cmd.get_cmd_line_argument("c", input_size.c());
cmd.get_cmd_line_argument("k", filter_size.n());
cmd.get_cmd_line_argument("r", filter_size.h());
cmd.get_cmd_line_argument("s", filter_size.w());
filter_size.c() = input_size.c();
cmd.get_cmd_line_argument("alpha_w", alpha.w());
cmd.get_cmd_line_argument("alpha_x", alpha.x());
cmd.get_cmd_line_argument("alpha_y", alpha.y());
cmd.get_cmd_line_argument("alpha_z", alpha.z());
cmd.get_cmd_line_argument("beta_w", beta.w());
cmd.get_cmd_line_argument("beta_x", beta.x());
cmd.get_cmd_line_argument("beta_y", beta.y());
cmd.get_cmd_line_argument("beta_z", beta.z());
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("tag", tag);
if (filter_size.h() == 3 && filter_size.w() == 3) {
padding = {1, 1, 1, 1};
}
else {
filter_size.h() = 1;
filter_size.w() = 1;
padding = {0, 0, 0, 0};
}
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "22_quaternion_conv example\n\n"
<< " This example uses Ampere's Tensor Core operators on F16 data types to compute\n"
<< " forward convolution on tensors of layout NHWC.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --n <int> Input tensor extent N\n"
<< " --h <int> Input tensor extent H\n"
<< " --w <int> Input tensor extent W\n"
<< " --c <int> Input tensor extent C\n"
<< " --k <int> Filter extent K\n"
<< " --r <int> Filter extent R\n"
<< " --s <int> Filter extent S\n\n"
<< " --alpha <float> Epilogue scalar alpha\n"
<< " --beta <float> Epilogue scalar beta\n\n"
<< " --ref-check If set (true), reference check on the host is computed\n"
<< " --perf-check If set (true), performance is measured.\n"
<< " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n"
<< " --iterations <int> Number of profiling iterations to perform.\n"
<< " --save-workspace If set, workspace is written to a text file.\n"
<< " --tag <string> String to replicate across the first column in the results table\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
<< "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
return out;
}
/// Computes the output tensor size (NPQK)
cutlass::Tensor4DCoord output_size() const {
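// With the default sizes above (H = W = 32, R = S = 3, padding 1, stride 1) this evaluates to a
// 1x32x32x32 output tensor (NPQK).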
return cutlass::Tensor4DCoord(
input_size.n(),
(input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
(input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
filter_size.n());
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of multiply-adds = NPQK * CRS
int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()) * 16;
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cutlass::Status reference_check;
cudaError_t error;
Result():
runtime_ms(0),
gflops(0),
status(cutlass::Status::kSuccess),
reference_check(cutlass::Status::kInvalid),
error(cudaSuccess) { }
static std::ostream & print_header(std::ostream &out, Options const &options) {
if (!options.tag.empty()) {
out << "Name,";
}
out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
return out;
}
std::ostream & print(std::ostream &out, int idx, Options const &options) {
if (!options.tag.empty()) {
out << options.tag << ",";
}
out
<< "conv_" << idx << ","
<< options.input_size.n() << ","
<< options.input_size.h() << ","
<< options.input_size.w() << ","
<< options.input_size.c() << ","
<< options.filter_size.n() << ","
<< options.filter_size.h() << ","
<< options.filter_size.w() << ","
<< runtime_ms << ","
<< gflops;
return out;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Runs one benchmark
Result profile_convolution(Options const &options) {
Result result;
//
// Allocate host-device tensors using the CUTLASS Utilities.
//
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
//
// Initialize tensors
//
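// The uniform fills below take (view, seed, max, min, bits); with bits = 0 the
// generated values are whole numbers in the range [-8, 7].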
// Fill tensor A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
7,
-8,
0);
// Fill tensor B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
7,
-8,
0);
// Fill tensor C on host with zeros
cutlass::reference::host::TensorFill(
tensor_c.host_view());
// Fill tensor C for reference on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_c.host_view());
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_ref_c.sync_device();
//
// Define arguments for CUTLASS Convolution
//
cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
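// kCrossCorrelation applies the filter without flipping it, matching the
// convolution convention of most deep learning frameworks.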
// Partition the GEMM K dimension into a single slice (no split-K reduction)
int split_k_slices = 1;
// Construct Conv2dProblemSize with user defined output size
cutlass::conv::Conv2dProblemSize problem_size(
options.input_size,
options.filter_size,
options.padding,
options.conv_stride,
options.dilation,
options.output_size(),
mode,
split_k_slices
);
// Construct ImplicitGemm::Argument structure with conv2d
// problem size, data pointers, and epilogue values
typename ImplicitGemm::Arguments arguments{
problem_size,
tensor_a.device_ref(),
tensor_b.device_ref(),
tensor_c.device_ref(),
tensor_c.device_ref(),
{options.alpha, options.beta},
};
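// tensor_c serves as both the C operand and the D output, so the epilogue
// result overwrites tensor_c in place on the device.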
//
// Initialize CUTLASS Convolution
//
ImplicitGemm implicit_gemm_op;
size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
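// With a single split-K slice the kernel typically needs no auxiliary
// workspace, so this allocation is usually zero bytes.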
result.status = implicit_gemm_op.can_implement(arguments);
CUTLASS_CHECK(result.status);
result.status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(result.status);
//
// Launch initialized CUTLASS kernel
//
result.status = implicit_gemm_op();
CUTLASS_CHECK(result.status);
//
// Optional reference check
//
if (options.reference_check) {
std::cout << "Verification on host...\n";
// Compute with reference implementation
cutlass::reference::host::Conv2dFprop<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementAccumulator,
cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>
>(
problem_size,
tensor_a.host_ref(),
tensor_b.host_ref(),
tensor_c.host_ref(),
tensor_ref_c.host_ref(),
options.alpha,
options.beta
);
// Check if output from CUTLASS kernel and reference kernel are equal or not
tensor_c.sync_host();
bool passed = cutlass::reference::host::TensorEquals(
tensor_c.host_view(),
tensor_ref_c.host_view());
if (!passed) {
result.reference_check = cutlass::Status::kErrorInternal;
std::cout << "ERROR - results miscompared.\n";
}
else {
result.reference_check = cutlass::Status::kSuccess;
std::cout << "Passed.\n";
}
}
else {
result.reference_check = cutlass::Status::kInvalid;
}
if (options.save_workspace) {
std::stringstream ss;
ss << "22_quaternion_conv_"
<< options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c()
<< "_"
<< options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c()
<< ".dat";
std::ofstream output_workspace(ss.str());
output_workspace
<< "Input = \n" << tensor_a.host_view() << "\n\n"
<< "Filters = \n" << tensor_b.host_view() << "\n\n";
if (options.reference_check) {
output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
}
output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
std::cout << "Results written to '" << ss.str() << "'." << std::endl;
}
//
// Performance measurement
//
if (options.measure_performance) {
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
}
// Record an event at the start of a series of convolution operations.
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Launch a sequence of implicit GEMM operations on the device
for (int iteration = 0; iteration < options.iterations; ++iteration) {
result.status = implicit_gemm_op();
CUTLASS_CHECK(result.status);
}
// Record an event when the convolutions have been launched.
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Print average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
}
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
if (options.benchmark) {
// Benchmark several layers
int batch_sizes[] = {1, 32, 64, 128, 256, 512};
struct Benchmark {
int h, w, c, k, r, s;
} layers[] = {
{56, 56, 64, 256, 1, 1},
{56, 56, 64, 64, 1, 1},
{56, 56, 64, 64, 3, 3},
{56, 56, 256, 64, 1, 1},
{56, 56, 256, 512, 1, 1},
{56, 56, 256, 128, 1, 1},
{28, 28, 128, 128, 3, 3},
{28, 28, 128, 512, 1, 1},
{28, 28, 512, 128, 1, 1},
{28, 28, 512, 1024, 1, 1},
{28, 28, 512, 256, 1, 1},
{14, 14, 256, 256, 3, 3},
{14, 14, 256, 1024, 1, 1},
{14, 14, 1024, 256, 1, 1},
{14, 14, 1024, 2048, 1, 1},
{14, 14, 1024, 512, 1, 1},
{7, 7, 512, 512, 3, 3},
};
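// The layer extents above roughly follow the convolution shapes of ResNet-50.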
Result::print_header(std::cout, options) << std::endl;
int idx = 1;
for (auto const &layer : layers) {
for (auto N : batch_sizes) {
options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});
Result result = profile_convolution(options);
result.print(std::cout, idx, options) << std::endl;
}
++idx;
}
}
else {
// Execute one problem size
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
Result result = profile_convolution(options);
Result::print_header(std::cout, options) << std::endl;
result.print(std::cout, 1, options) << std::endl;
}
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -28,10 +28,14 @@ add_custom_target(test_examples)
function(cutlass_example_add_executable NAME)
set(options)
set(oneValueArgs)
set(oneValueArgs DISABLE_TESTS)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()
cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS})
add_dependencies(cutlass_examples ${NAME})
@ -60,6 +64,7 @@ function(cutlass_example_add_executable NAME)
DEPENDEES test_examples ${__DEPENDEES}
TEST_COMMAND_OPTIONS ${__TEST_COMMAND_OPTIONS}
DISABLE_EXECUTABLE_INSTALL_RULE
DISABLE_TESTS ${__DISABLE_TESTS}
)
endfunction()
@ -83,6 +88,11 @@ foreach(EXAMPLE
15_ampere_sparse_tensorop_gemm
16_ampere_tensorop_conv2dfprop
17_fprop_per_channel_bias
18_ampere_fp64_tensorop_affine2_gemm
19_tensorop_canonical
20_simt_canonical
21_quaternion_gemm
22_quaternion_conv
)
add_subdirectory(${EXAMPLE})